import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Actor, Critic, OUNoise, ReplayBuffer and PhysicsSim are assumed to come from
# the project's own modules; their definitions are not part of this listing.


class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
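
        # Actor (Policy) Model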
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        # previously tried: theta = 0.085, sigma = 0.15
        self.exploration_theta = 0.070
        self.exploration_sigma = 0.20
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.70  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
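        # Note: the trailing 0 / 1 values are presumably the Keras learning-phase
        # flag expected by these backend functions (0 = inference when querying
        # the action gradients, 1 = training when applying the actor update).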

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
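

# The agents in this listing construct OUNoise and ReplayBuffer helpers that are
# defined elsewhere in the project. A minimal sketch, with the interface inferred
# from the calls above (constructor arguments, reset/sample, add/sample/__len__),
# might look like this:
import random
from collections import deque, namedtuple

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer of experience tuples sampled uniformly at random."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)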
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    # name: base name used when saving the neural network models
    # loadfile: if True, load weights from existing model files; otherwise create an entirely new model
    def __init__(self, task, name, loadfile=False):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        self.name = name
        if loadfile:
            self.actor_local.model.load_weights("./weights/" + name +
                                                "_actor.h5")
            self.critic_local.model.load_weights("./weights/" + name +
                                                 "_critic.h5")

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15  #0.3 #original 0.15
        self.exploration_sigma = 0.3  #0.3 #original 0.3

        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)

        # rewards = np.interp(rewards, (rewards.min(), rewards.max()), (-1, +1))  # experiment: rescale rewards into [-1, 1]

        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def save_weights(self):
        self.actor_local.model.save_weights("./weights/" + self.name +
                                            "_actor.h5")
        self.critic_local.model.save_weights("./weights/" + self.name +
                                             "_critic.h5")

    #Notice that after training over a batch of experiences, we could just copy our newly learned weights (from the local model) to the target model.
    #However, individual batches can introduce a lot of variance into the process, so it's better to perform a soft update, controlled by the parameter tau.
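    # For example, with tau = 0.001 a single soft update moves each target weight
    # only 0.1% of the way toward its local counterpart:
    #     w_target <- tau * w_local + (1 - tau) * w_target
    # so the target networks drift toward the local networks slowly and smoothly.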

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
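

# A minimal sketch of how this agent might be driven. The `task` argument is any
# object with the reset()/step() interface used above (e.g. the Task class at the
# end of this listing); the name and episode count are illustrative only. In a real
# project each example lives in its own module, so DDPG here refers to the class
# defined directly above.
def run_training(task, num_episodes=500):
    agent = DDPG(task, name="quadcopter", loadfile=False)
    for _ in range(num_episodes):
        state = agent.reset_episode()
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done = task.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
    agent.save_weights()  # expects a ./weights/ directory to exist
    return agent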
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""
    def __init__(self, task, basename):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # learning rates
        self.actor_learning_rate = 0.0001
        self.critic_learning_rate = 0.001

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high,
                                 self.actor_learning_rate)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_learning_rate)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size,
                                   self.critic_learning_rate)
        self.critic_target = Critic(self.state_size, self.action_size,
                                    self.critic_learning_rate)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

        # keep track of the best run
        self.nEpisode = 0
        self.bestEpisode = []
        self.bestEpisodeAt = -1

        # logging setup
        self.state_labels = self.task.get_state_labels()
        self.action_labels = [
            'ac{}'.format(i) for i in range(self.action_size)
        ]
        self.df_columns = [
            't'
        ] + self.state_labels.tolist() + self.action_labels + ['R']
        self.basename = os.path.join('log', basename)
        self.currentEpisode = []
        self.bestCumReward = -np.inf

    def reset_episode(self):
        self.noise.reset()
        self.last_state = self.task.reset()
        self.currentEpisode = []
        return self.last_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights),\
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)

    def step(self, action):
        last_state_variables = self.task.get_state_variables()
        last_t = self.task.sim.get_time()

        # call the model for state transition
        next_state, reward, done = self.task.step(action)

        # logging the current episode
        self.currentEpisode += [
            np.hstack([last_t, last_state_variables, action, reward])
        ]

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # log the episode
            df = pd.DataFrame(self.currentEpisode, columns=self.df_columns)
            fn_i = '{}_{}'.format(self.basename, self.nEpisode)
            df.to_csv(fn_i + '.csv')
            cumR = df.R.sum()
            if len(df) > len(self.bestEpisode) or \
            (len(df) == len(self.bestEpisode) and cumR > self.bestCumReward):
                self.bestCumReward = cumR
                self.bestEpisode = df
                self.bestEpisodeAt = self.nEpisode
                self.plot_episode(df, self.nEpisode, fn_i)
            sys.stdout.write(
                "\rEp#{:4d} dur_{} cumR_{:5.3f} best@{} dur_{} cumR_{:5.3f} ".
                format(self.nEpisode, len(df), cumR, self.bestEpisodeAt,
                       len(self.bestEpisode), self.bestCumReward))
            self.nEpisode += 1
        return next_state, done

    def train(self, num_episodes=1):
        for ep_i in range(num_episodes):
            state, done = self.reset_episode(), False
            while not done:
                action = self.act(state)
                state, done = self.step(action)

    def plot_episode(self, df, episNo, filename=''):
        fig = plt.figure(1)
        fig.clf()
        ax2 = fig.add_subplot(313)
        ax1 = fig.add_subplot(312, sharex=ax2)
        ax0 = fig.add_subplot(311, sharex=ax2)
        # plot selected state variables
        ax0.set_title('Ep#{} dur={:5.2f} sec'.format(episNo, df.t.iloc[-1]))
        df.plot(x='t', y=self.state_labels[:6], ax=ax0, style='.:')
        df.plot(x='t', y=self.state_labels[6:], ax=ax1, style='.:')
        df.plot(x='t', y=self.action_labels, ax=ax2, style='.:')
        df.plot(x='t', y='R', ax=ax2, secondary_y=True)
        plt.ylabel('Reward')
        plt.show()
        if len(filename) > 0:
            fig.savefig(filename)
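

# Usage sketch for this logging variant: the basename is an illustrative prefix
# for the per-episode CSV files and plots written under ./log/ (the directory is
# assumed to exist).
def run_logged_training(task, num_episodes=500):
    agent = DDPG(task, basename="hover_run")
    agent.train(num_episodes=num_episodes)
    return agent.bestEpisode, agent.bestEpisodeAt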
class DDPG():
    """Reinforcement learning agent who learns using DDPG"""

    def __init__(self,task):
        """Initialize models"""
        self.env = task
        self.state_size = task.observation_space.shape[0]
        self.action_size = task.action_space.shape[0]
        self.action_high = task.action_space.high
        self.action_low = task.action_space.low

        # Initialize Actor (policy) models
        self.actor_local = Actor(self.state_size,self.action_size,self.action_low,self.action_high)
        self.actor_target = Actor(self.state_size,self.action_size,self.action_low,self.action_high)

        # Initialize Critic (value) models
        self.critic_local = Critic(self.state_size,self.action_size)
        self.critic_target = Critic(self.state_size,self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay buffer

        self.buffer_size = 100000 
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size,self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters

    def reset_episode(self,task):
        """Return state after reseting task"""
        self.noise.reset()
        state = task.reset()
        self.last_state = state
        return state

    def step(self,action,reward,next_state,done):
        # Add experience to memory
        self.memory.add_experience(self.last_state,action,reward,next_state,done)

        # Learn if memory is larger than batch size
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        
        # Roll over state
        self.last_state = next_state

    def act(self,state):
        """Returns action using the policy network """
        state = np.reshape(state,[-1,self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action+self.noise.sample())

    def learn(self,experiences):
        # Convert experience tuples to separate arrays for each element

        states = np.vstack([e.state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.action_size) 
        next_states = np.vstack([e.next_state for e in experiences if e is not None]).astype(np.float32).reshape(-1,self.state_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1,1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1,1)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict(next_states)
        Q_targets_next = self.critic_target.model.predict([next_states,actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma*Q_targets_next*(1-dones)
        self.critic_local.model.train_on_batch(x=[states,actions],y=Q_targets)
        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            [-1, self.action_size])
        self.actor_local.train_fn([states,action_gradients,1])

        # Soft-update target models
        self.soft_update(self.actor_local.model,self.actor_target.model)
        self.soft_update(self.critic_local.model,self.critic_target.model)

    def soft_update(self,local_model,target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), "Local and target model parameters must have the same size"

        new_weights = self.tau*local_weights + (1-self.tau)*target_weights
        target_model.set_weights(new_weights)

    def save_model(self,path):
        self.actor_local.model.save_weights(path)

    def load_model(self,path):
        self.actor_local.model.load_weights(path)

    def act_only(self,state):
        """Returns actions for the given state per current policy, without exploration noise."""
        state = np.reshape(state,[-1,self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action)
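

# A minimal sketch of driving this Gym-style agent. The environment name and
# episode count are arbitrary examples, and the classic 4-tuple return of
# env.step() is assumed (newer Gym/Gymnasium versions return 5 values).
def run_gym_training(env_name="Pendulum-v0", num_episodes=200):
    import gym  # assumed to be installed; not required by the rest of this listing
    env = gym.make(env_name)
    agent = DDPG(env)
    for _ in range(num_episodes):
        state = agent.reset_episode(env)
        done = False
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(action, reward, next_state, done)
            state = next_state
    agent.save_model("ddpg_actor_weights.h5")  # illustrative filename
    return agent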
class Task():
    def __init__(self,
                 runtime=5.,
                 init_pose=np.array([0.0, 0.0, 10.0, 0.0, 0.0, 0.0]),
                 init_velocities=np.array([0.0, 0.0, 0.0]),
                 init_angle_velocities=np.array([0.0, 0.0, 0.0]),
                 pos_noise=0.25,
                 angle_noise=None,
                 velocity_noise=0.15,
                 velocity_angle_noise=None,
                 target_pos=np.array([0.0, 0.0, 10.0])):

        self.target_pos = target_pos
        self.pos_noise = pos_noise
        self.angle_noise = angle_noise
        self.velocity_noise = velocity_noise
        self.velocity_angle_noise = velocity_angle_noise
        self.action_size = 1
        self.action_repeat = 1
        self.action_high = 1.2 * 400
        self.action_low = 0.99 * 400
        self.noise = OUNoise(self.action_size, mu=0.0, theta=0.2, sigma=0.1)
        self.action_b = (self.action_high + self.action_low) / 2.0
        self.action_m = (self.action_high - self.action_low) / 2.0

        # Simulation
        self.sim = PhysicsSim(init_pose, init_velocities,
                              init_angle_velocities, runtime)
        self.state_size = len(self.get_state())

    def get_reward(self):
        """Uses current pose of sim to return reward."""

        # Alternative reward formulations tried during tuning (kept for reference):
        # reward = np.tanh(1 - 0.7 * (abs(self.sim.pose[:3] - self.target_pos))).sum()
        # print("reward ", reward)

        # reward = np.square(self.sim.pose[:3] - self.target_pos).sum()
        # reward = np.sqrt(reward)
        # reward /=3
        # print("\n")
        # print("self.sim.pose ", self.sim.pose)
        # print("self.target_pos ", self.target_pos)
        # np.clip(reward, 10, -10)
        # reward /= 10
        # reward = 1.-.3*(abs(self.sim.pose[:3] - self.target_pos)).sum()
        # reward = np.tanh(1 - 0.003 * (abs(self.sim.pose[:3] - self.target_pos))).sum()

        # reward = np.tanh(1 - 0.3 * (abs(self.sim.pose[:3] - self.target_pos))).sum()

        # reward = (1.5 - np.sum(np.square(( self.sim.pose[:3] -  self.target_pos) / 300.0))) * 2

        # reward = np.tanh( 1.-0.3*(abs(self.sim.pose[:3] - self.target_pos)).sum())
        # reward = (0.5 - np.mean(np.square((self.sim.pose[:3] - self.target_pos) / 200.0))) * 2
        # reward = (0.5 - np.mean(np.square((self.sim.pose[:3] - self.target_pos) / 300.0))) * 2

        # reward = 1. - .3 * (abs(self.sim.pose[:3] - self.target_pos)).sum()
        # if(self.sim.pose[2] >0) :
        #     reward +=  10 #abs(self.sim.pose[2] - self.target_pos[2])
        # else:
        #     reward -=  10 #abs(self.sim.pose[2] - self.target_pos[2])
        # reward = np.tanh(reward)
        # reward = np.tanh(1 - np.mean(np.square(self.sim.pose[:3] - self.target_pos)))
        # reward=0
        # if (self.sim.pose[2] > 0):
        #     reward +=1
        # if self.sim.pose[2] >= self.target_pos[2]:
        #     reward += 5
        # return reward
        # reward = self.sim.v[2] / 10.0
        # reward += (self.sim.pose[2] - self.target_pos[2]) / 10.0
        # reward -= np.linalg.norm(self.sim.pose[:2]) / 10.0
        # return reward
        #
        # p1 = self.sim.pose[:3]
        # p2 = self.target_pos
        # env_bounds = 300.0
        # bound = np.array([env_bounds, env_bounds, env_bounds])
        # reward = (0.5 - np.mean(np.square((p1 - p2) / bound))) * 2

        reward = np.tanh(1. - 0.76 *
                         (abs(self.sim.pose[:3] - self.target_pos)).sum())
        return reward

    def normalize_angles(self, angles):
        """Map angles into the range (-pi, pi]."""
        normalized_angles = np.copy(angles)
        for i in range(len(normalized_angles)):
            while normalized_angles[i] > np.pi:
                normalized_angles[i] -= 2 * np.pi
            while normalized_angles[i] < -np.pi:
                normalized_angles[i] += 2 * np.pi
        return normalized_angles

    def get_state(self):
        position_error = (self.sim.pose[:3] - self.target_pos)
        return np.array(
            [position_error[2], self.sim.v[2], self.sim.linear_accel[2]])

    def step(self, actionInput):
        reward = 0
        # pose_all = []
        for _ in range(self.action_repeat):
            action = np.array(actionInput, dtype=float)  # copy so the caller's action is not mutated
            action += self.noise.sample()
            action = np.clip(action, -1, 1)
            speed_of_rotors = (action * self.action_m) + self.action_b
            done = self.sim.next_timestep(
                speed_of_rotors *
                np.ones(4))  # update the sim pose and velocities
            reward += self.get_reward()
            next_state = self.get_state()
            if reward <= 0:
                done = True
            # pose_all.append(self.sim.pose)
            if self.sim.pose[2] >= self.target_pos[2]:
                reward += 1
            # if self.sim.pose[2] >= self.target_pos[2]:
            #     reward += 10
            # else:
            #     reward += -20
            # if done:
            #     if self.sim.time < self.sim.runtime:
            #         reward += -1
            #     else:
            #         reward += 5
            #     break
        # next_state = np.concatenate(pose_all)
        return next_state, reward, done

    def reset_noise(self):
        self.noise.reset()

    def reset(self):
        self.sim.reset()
        self.noise.reset()
        rnd_pos = np.copy(self.sim.init_pose)
        rnd_pos[2] += np.random.normal(0.0, self.pos_noise)
        self.sim.pose = np.copy(rnd_pos)
        rnd_velocity = np.copy(self.sim.init_velocities)
        rnd_velocity[2] += np.random.normal(0.0, self.velocity_noise)
        self.sim.v = np.copy(rnd_velocity)

        return self.get_state()
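

# Quick check of the action scaling used in Task.step(): a normalized action in
# [-1, 1] is mapped affinely onto the rotor-speed range [action_low, action_high],
# and the same speed is applied to all four rotors.
def check_action_scaling():
    action_low, action_high = 0.99 * 400, 1.2 * 400  # 396.0 and 480.0
    action_b = (action_high + action_low) / 2.0      # 438.0 (midpoint)
    action_m = (action_high - action_low) / 2.0      # 42.0 (half-range)
    for a in (-1.0, 0.0, 1.0):
        print(a, a * action_m + action_b)            # -> 396.0, 438.0, 480.0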