Example #1
    def test_combine_first_dimensions(self):
        x = np.random.randint(100, size=(3, 5, 7, 9))

        result = combine_first_dimensions(x)
        assert result.shape == (15, 7, 9)
        assert np.all(result[0] == x[0, 0])
        assert np.all(result[6] == x[1, 1])
        assert np.all(result[7] == x[1, 2])

        x = np.random.randint(10, size=(3, 5))
        assert np.all(combine_first_dimensions(x) == x.flatten())
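The test above pins down the behaviour of combine_first_dimensions: the first two axes are merged into one while the rest are preserved, so a (3, 5, 7, 9) array becomes (15, 7, 9) and a (3, 5) array flattens completely. A minimal sketch consistent with that test, assuming a plain NumPy reshape is all that is needed (the helper in the repository may differ):

import numpy as np

def combine_first_dimensions(x: np.ndarray) -> np.ndarray:
    # Merge the first two axes: (a, b, ...) -> (a * b, ...), keeping row-major
    # order so that result[i * b + j] == x[i, j].
    merged = x.shape[0] * x.shape[1]
    return x.reshape(merged, *x.shape[2:])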
Example #2
    def train(self,
            n_step_rewards,
            mb_obs_combined,
            mb_actions_combined
    ):
        feed_dict = {
            self.ph_value_target: n_step_rewards,
            self.ph_selected_spatial_action: mb_actions_combined["spatial_action"],
            self.ph_selected_action_id: mb_actions_combined["action_id"],
            self.ph_is_spatial_action_available: mb_actions_combined["is_spatial_action_available"]
        }
        feed_dict.update(self.obs_to_feeddict(mb_obs_combined))

        # treat each timestep as a separate observation, so batch_size will become batch_size * timesteps
        feed_dict = {k: combine_first_dimensions(v) for k, v in feed_dict.items()}
        ops = [self.train_op]
        write_all_summaries = (
            (self.train_step % self.all_summary_freq == 0) and
            self.summary_path is not None
        )
        write_scalar_summaries = (
            (self.train_step % self.scalar_summary_freq == 0) and
            self.summary_path is not None
        )

        if write_all_summaries:
            ops.append(self.all_summary_op)
        elif write_scalar_summaries:
            ops.append(self.scalar_summary_op)

        r = self.sess.run(ops, feed_dict)

        if write_all_summaries or write_scalar_summaries:
            self.summary_writer.add_summary(r[-1], global_step=self.train_step)

        self.train_step += 1
Example #3
    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1),
                             dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps),
                              dtype=np.float32)

        latest_obs = self.latest_obs  # state(t0)
        rnn_state = self.agent.state_init
        for n in range(self.n_steps):
            action_ids, spatial_action_2ds, value_estimate, rnn_state = self.agent.step(
                latest_obs, rnn_state)
            #print('step: ', n, action_ids, spatial_action_2ds, value_estimate)  # for debugging

            # Store actions and value estimates for all steps (this is being done in parallel with the other envs)
            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))
            # Execute the action in the environment, get the new observation and reward, and store the reward
            actions_pp = self.action_processer.process(action_ids,
                                                       spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)  # state(t+1)
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            # Check each env's timestep; if it is the last state of the episode, handle the episode end
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)
        # Get V(t+1) and use it as the expected future reward from s(t+1), since we don't know the actual return (bootstrapping with the network's predictions)
        mb_values[:, -1] = self.agent.get_value(latest_obs, rnn_state)

        n_step_advantage = general_n_step_advantage(mb_rewards,
                                                    mb_values,
                                                    self.discount,
                                                    lambda_par=1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage:
            n_step_advantage.transpose(),
            FEATURE_KEYS.value_target:
            (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        # We combine all experiences from every env
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {
            k: combine_first_dimensions(v)
            for k, v in full_input.items()
        }

        if not self.do_training:
            pass
        else:
            self.agent.train(
                full_input, rnn_state
            )  # You might want to reset the state between forward and backward pass

        self.latest_obs = latest_obs  #state(t) = state(t+1)
        self.batch_counter += 1
        print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()
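general_n_step_advantage is called here with rewards shaped [n_envs, n_steps], values shaped [n_envs, n_steps + 1] (the extra column holds the bootstrap value), a discount factor, and lambda_par; with lambda_par=1.0 it reduces to the plain n-step advantage. A hedged sketch of a GAE-style computation matching that call signature; it ignores episode terminations, which the repository's other variants handle via a done mask:

import numpy as np

def general_n_step_advantage(rewards, values, discount, lambda_par=1.0):
    # rewards: [n_envs, n_steps]; values: [n_envs, n_steps + 1] (last column = bootstrap V)
    n_envs, n_steps = rewards.shape
    # One-step TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    deltas = rewards + discount * values[:, 1:] - values[:, :-1]
    advantage = np.zeros_like(rewards)
    running = np.zeros(n_envs, dtype=rewards.dtype)
    # Backward discounted sum of TD errors with decay gamma * lambda
    for t in reversed(range(n_steps)):
        running = deltas[:, t] + discount * lambda_par * running
        advantage[:, t] = running
    return advantage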
Example #4
    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1),
                             dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps),
                              dtype=np.float32)
        mb_rewards_modified = np.zeros((self.envs.n_envs, self.n_steps),
                                       dtype=np.float32)

        latest_obs = self.latest_obs

        for n in range(self.n_steps):
            # We could calculate the value estimate from obs at training time,
            # but saving the values here makes the n-step reward calculation a bit easier
            action_ids, spatial_action_2ds, value_estimate = self.agent.step(
                latest_obs)

            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            actions_pp = self.action_processer.process(action_ids,
                                                       spatial_action_2ds)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)
            mb_rewards[:, n] = [t.reward for t in obs_raw]
            # NEW
            i = 0
            last_dist = self.last_min_dist_to_enemy
            #print(last_dist)
            curr_dist = min_distance_to_enemy(obs_raw[0], minimap=True)
            #print(curr_dist)
            if last_dist < INF and curr_dist < INF:
                mb_rewards_modified[:, n] = [
                    t.reward + (last_dist - curr_dist) / 20 for t in obs_raw
                ]
            self.last_min_dist_to_enemy = curr_dist
            ###
            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        mb_values[:, -1] = self.agent.get_value(latest_obs)

        n_step_advantage = general_n_step_advantage(
            mb_rewards,
            mb_rewards_modified,
            mb_values,
            self.discount,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage:
            n_step_advantage.transpose(),
            FEATURE_KEYS.value_target:
            (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {
            k: combine_first_dimensions(v)
            for k, v in full_input.items()
        }

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1
        sys.stdout.flush()
Example #5
    def run_batch(self):
        #(MINE) MAIN LOOP!!!
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.num_envs, self.n_steps + 1),
                             dtype=np.float32)
        mb_rewards = np.zeros((self.envs.num_envs, self.n_steps),
                              dtype=np.float32)
        mb_done = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.int32)

        latest_obs = self.latest_obs  # (MINE) =state(t)

        for n in range(self.n_steps):
            # We could calculate the value estimate from obs at training time,
            # but saving the values here makes the n-step reward calculation a bit easier
            action_ids, value_estimate = self.agent.step(latest_obs)
            print(
                '|step:', n, '|actions:', action_ids
            )  # (MINE) If this print is placed after envs.step, the SUCCESS message appears during envs.step and the output looks odd
            # (MINE) Store actions and value estimates for all steps
            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids))
            # (MINE)  do action, return it to environment, get new obs and reward, store reward
            #actions_pp = self.action_processer.process(action_ids) # Actions have changed now need to check: BEFORE: actions.FunctionCall(actions.FUNCTIONS.no_op.id, []) NOW: actions.FUNCTIONS.no_op()
            obs_raw = self.envs.step(action_ids)
            #obs_raw.reward = reward
            latest_obs = self.obs_processer.process(
                obs_raw[0]
            )  # For obs_raw as tuple! #(MINE) =state(t+1). Processes all inputs/obs from all timesteps (and envs)
            #print('-->|rewards:', np.round(np.mean(obs_raw[1]), 3))
            mb_rewards[:, n] = [t for t in obs_raw[1]]
            mb_done[:, n] = [t for t in obs_raw[2]]

            # Check each timestep/observation t in obs_raw for whether its last-state flag is true, i.e. the episode ended
            # IF MAX_STEPS OR GOAL REACHED
            # You can do the same with obs_raw[4], which indicates success or failure
            #print(obs_raw[2])
            indx = 0
            for t in obs_raw[2]:
                if t:  # done == True
                    # Put reward in scores
                    epis_reward = obs_raw[3][indx]['episode']['r']
                    epis_length = obs_raw[3][indx]['episode']['l']
                    last_step_r = obs_raw[1][indx]
                    self._handle_episode_end(
                        epis_reward, epis_length, last_step_r
                    )  # The score-printing process is apparently NOT parallel, as each reward (t) is fed in independently
                indx = indx + 1  # finished envs count
            # for t in obs_raw:
            #     if t.last():
            #         self._handle_episode_end(t)

        #print(">> Avg. Reward:",np.round(np.mean(mb_rewards),3))
        mb_values[:, -1] = self.agent.get_value(
            latest_obs
        )  # We bootstrap from the last step if not terminal (although no terminal check is done here)

        n_step_advantage = general_n_step_advantage(
            mb_rewards,
            mb_values,
            self.discount,
            mb_done,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage:
            n_step_advantage.transpose(),
            FEATURE_KEYS.value_target:
            (n_step_advantage + mb_values[:, :-1]).transpose(
            )  # Adding the value to the advantage gives the target for value-function training. Check OneNote in APL-virtual
        }
        #(MINE) Probably we combine all experiences from every worker below
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {
            k: combine_first_dimensions(v)
            for k, v in full_input.items()
        }

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1  # Used only for printing; the outer while loop controls the number of batches
        print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()
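This variant steps a vectorized gym environment, so obs_raw is assumed to follow the usual (observations, rewards, dones, infos) convention, with Monitor-wrapped envs exposing final episode statistics under infos[i]['episode']. A hypothetical helper equivalent to the inline loop over obs_raw[2] above (names are illustrative, not from the repository):

def handle_finished_episodes(obs_raw, handle_episode_end):
    # obs_raw = (observations, rewards, dones, infos) from a vectorized env step
    observations, rewards, dones, infos = obs_raw
    for env_idx, done in enumerate(dones):
        if done:
            # Monitor stores the finished episode's return and length in the info dict
            episode_return = infos[env_idx]['episode']['r']
            episode_length = infos[env_idx]['episode']['l']
            handle_episode_end(episode_return, episode_length, rewards[env_idx])
    return observations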
Example #6
    def run_batch(self):
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1),
                             dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps),
                              dtype=np.float32)
        self.sprint.print(
            ["values,rewards=", mb_values.shape, mb_rewards.shape])

        latest_obs = self.latest_obs

        for n in range(self.n_steps):
            # We could calculate the value estimate from obs at training time,
            # but saving the values here makes the n-step reward calculation a bit easier
            action_ids, spatial_action_2ds, value_estimate = self.agent.step(
                latest_obs)

            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids, spatial_action_2ds))

            actions_pp = self.action_processer.process(action_ids,
                                                       spatial_action_2ds)
            self.aprint.print(
                ["test act_print", action_ids, spatial_action_2ds, actions_pp])
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processer.process(obs_raw)
            self.oprint.print([
                "test obs_print", [(k, v.shape) for k, v in latest_obs.items()]
            ])
            mb_rewards[:, n] = [t.reward for t in obs_raw]

            for t in obs_raw:
                if t.last():
                    self._handle_episode_end(t)

        mb_values[:, -1] = self.agent.get_value(latest_obs)

        n_step_advantage = general_n_step_advantage(
            mb_rewards,
            mb_values,
            self.discount,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0)

        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays
            FEATURE_KEYS.advantage:
            n_step_advantage.transpose(),
            FEATURE_KEYS.value_target:
            (n_step_advantage + mb_values[:, :-1]).transpose()
        }
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        full_input = {
            k: combine_first_dimensions(v)
            for k, v in full_input.items()
        }
        self.batch_print.print(
            ["full_input=", [(k, v.shape) for (k, v) in full_input.items()]])

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1
        sys.stdout.flush()
Example #7
    def run_batch(self):
        """run_batch

        Run a batch of the training, building up a list of actions, observations,
        values of those actions and the rewards given.
        """
        if self.number_episodes == self.episode_counter:
            print("Max number episodes reached. Quitting.")
            return False
        # Define variables to store the actions, observations, values and rewards in.
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.n_envs, self.n_steps + 1), dtype=np.float32)
        mb_rewards = np.zeros((self.envs.n_envs, self.n_steps), dtype=np.float32)

        latest_obs = self.latest_obs

        # For the number of steps, save the relevant data each step.
        # When finished, deal with the episode end.
        for n_step in range(self.n_steps):
            # Save the value estimate here, to make the n step reward calculation easier.
            action_id, spatial_action_2d, value_estimate = self.agent.step(latest_obs)

            mb_values[:, n_step] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_id, spatial_action_2d))

            actions_pp = self.action_processor.process(action_id, spatial_action_2d)
            obs_raw = self.envs.step(actions_pp)
            latest_obs = self.obs_processor.process(obs_raw)
            mb_rewards[:, n_step] = [t.reward for t in obs_raw]

            for timestep in obs_raw:
                if timestep.last():
                    self._handle_episode_end(timestep)

        mb_values[:, -1] = self.agent.get_value(latest_obs)

        n_step_advantage = general_n_step_advantage(
            mb_rewards,
            mb_values,
            self.discount,
            lambda_par=1.0
        )

        full_input = {
            # These are transposed because action/obs
            # processors return [time, env, ...] shaped arrays.
            FEATURE_KEYS.advantage: n_step_advantage.transpose(),
            FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()
        }

        full_input.update(self.action_processor.combine_batch(mb_actions))
        full_input.update(self.obs_processor.combine_batch(mb_obs))
        full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()}

        if not self.do_training:
            pass
        else:
            self.agent.train(full_input)

        self.latest_obs = latest_obs
        self.batch_counter += 1

        sys.stdout.flush()
        return True
Example #8
    def run_meta_batch(self):
        mb_actions = []
        mb_obs = []
        mb_rnns = []
        mb_values = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
        mb_rewards = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32) # n x d array (ndarray)
        mb_done = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.int32)

        # EVERYTHING IS HAPPENING IN PARALLEL!!!
        r_=np.zeros((self.envs.num_envs, 1), dtype=np.float32) # Instead of 1 you might use n_steps
        a_=np.zeros((self.envs.num_envs), dtype=np.int32)
        latest_obs = self.latest_obs # (MINE) =state(t)
        rnn_state = self.agent.theta.state_init
        for n in range(self.n_steps):
            action_ids, value_estimate, rnn_state_new = self.agent.step_recurrent(latest_obs, rnn_state, r_, a_) # Automatically returns [num_envs, outx] for each outx you want
            # print('|step:', n, '|actions:', action_ids)
            # (MINE) Store actions and value estimates for all steps
            mb_values[:, n] = value_estimate
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids))
            # (MINE)  do action, return it to environment, get new obs and reward, store reward
            obs_raw = self.envs.step(action_ids)
            latest_obs = self.obs_processer.process(obs_raw[0]) # For obs_raw as tuple! #(MINE) =state(t+1). Processes all inputs/obs from all timesteps (and envs)

            mb_rnns.append(rnn_state)
            rnn_state = rnn_state_new
            r_ = obs_raw[1] # (nenvs,) but you need (nenvs,1)
            r_ = np.reshape(r_,[self.envs.num_envs,1]) # gets into recurrency as [nenvs,1] # The 1 might be used as timestep
            a_ = action_ids

            mb_rewards[:, n] = [t for t in obs_raw[1]]
            mb_done[:, n] = [t for t in obs_raw[2]]

            # Shouldn't this part below be OUT of the n-step loop? NO: you check whether done=True and extract the additional info that Monitor outputs
            indx = 0  # env count
            for t in obs_raw[2]: # Monitor returns additional info such as epis_reward and epis_length, apart from the obs, r, done, info
                # obs_raw[2] = done = [True, False, False, True,...] each element corresponds to an env (index gives the env)
                if t:  # done == True
                    # Put reward in scores
                    epis_reward = obs_raw[3][indx]['episode']['r']
                    epis_length = obs_raw[3][indx]['episode']['l'] # EPISODE LENGTH HERE!!!
                    last_step_r = obs_raw[1][indx]
                    self._handle_episode_end(epis_reward, epis_length, last_step_r) # The score-printing process is apparently NOT parallel, as each reward (t) is fed in independently
                    # Here you have to reset the rnn_state of that env (rnn_state has h and c, each with dims [batch_size x hidden_dims])
                    rnn_state[0][indx] = np.zeros(256)
                    rnn_state[1][indx] = np.zeros(256)
                    #reset the relevant r_ and a_
                    r_[indx] = 0
                    a_[indx] = 0

                indx = indx + 1 # finished envs count
        # Below: if all are zeros then you get nsteps
        mb_l = self.first_nonzero(mb_done, axis=1) # the last obs s --> s_done do not get into the training!!! This is because s_done --> ? and R=0
        # Substitute 0s with 1 declaring 1 step
        # mb_l[mb_l==0] = 1
        for b in range(mb_l.shape[0]):
            if mb_l[b] < self.n_steps:
                mb_l[b] = mb_l[b] + 1 # You start stepping from step 0 to 1
        mask = ~(np.ones(mb_rewards.shape).cumsum(axis=1).T > mb_l).T
        mb_rewards = mb_rewards*mask
        # Below: r + gamma*V(s')*(1-done) - V(s) // V(s') = mb_values[:, -1], but if it's done we don't care whether we use the last n-step V or the actual V(s_done+1), since the whole term will be zero
        # From V(nstep) estimate the expected reward - but what if we finished the sequence earlier?
        # This is for the last obs which you do not store. All other values that will be used as targets are available
        # mb_values[:, -1] = self.agent.get_recurrent_value(latest_obs, rnn_state, r_, a_) # Put at last slot the estimated future expected reward for bootstrap the V after the nsteps
        mask_v = ~(np.ones(mb_values.shape).cumsum(axis=1).T > mb_l).T
        mb_values = mb_values*mask_v
        # We take the vector of values after nsteps and we use ONLY the valid ones in the loop below
        vec = self.agent.get_recurrent_value(latest_obs, rnn_state, r_, a_)
        for b in range(mb_l.shape[0]):
            if mb_l[b] == self.n_steps:
                mb_values[b, -1] = vec[b] # if the sequence didn't end within nsteps then we add the value so the term gamma*V(t+1)*(1-done) is not 0
        # Mask below the values that enter the nstep advantage
        n_step_advantage = general_nstep_adv_sequential(
            mb_rewards,
            mb_values,
            self.discount,
            mb_done,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
            nenvs=self.n_envs,
            maxsteps=self.n_steps
        )
        # n_step_advantage = general_n_step_advantage(
        #     mb_rewards,
        #     mb_values,
        #     self.discount,
        #     mb_done,
        #     lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0
        # )
        # prev_rewards = [0] + mb_rewards[:, :-1]#.tolist() # from the rewards you take out the last element and replace it with 0
        prev_rewards = np.c_[np.zeros((self.envs.num_envs, 1), dtype=np.float32), mb_rewards[:, :-1]]
        # Below we add one zero action element and we take out the a_t so we get at=0:t-1
        prev_actions = [np.zeros((self.envs.num_envs), dtype=np.int32)] + mb_actions[:-1] # You have to pad this probably to have equal lengths with your data in terms of nsteps
        full_input = {
            FEATURE_KEYS.advantage: n_step_advantage.transpose(),
            FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose() # NETWORK TARGET (A+V=R)
        }

        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        # full_input.update(self.action_processer.combine_batch(prev_actions)) # THIS FEEDS AGAIN THE SELECTED_ID_ACTION PLACEHOLDER!!!!
        # full_input.update(self.action_processer.combine_batch(prev_rewards)) # THIS FEEDS AGAIN THE SELECTED_ID_ACTION PLACEHOLDER!!!!
        # Below: [maxsteps x batch] --> [batch x maxsteps]
        full_input = {k: np.swapaxes(v, 0, 1) for k, v in full_input.items()} # obs should be transposed first and then combined, otherwise you lose the time order
        full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()} # YOU CAN COMMENT THIS AND UNCOMMENT BELOW
        # full_input = {k: np.swapaxes(v,0,1) for k, v in full_input.items()}

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            if self.policy_type == MetaPolicy:
                self.agent.train_recurrent(full_input, mb_l, prev_rewards, prev_actions)
            else:
                self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1
        print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()
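self.first_nonzero(mb_done, axis=1) is used above to find, per environment, the step at which done first becomes non-zero; per the comment, a row of all zeros should yield n_steps. A sketch consistent with that behaviour (the actual method is not shown in this example):

import numpy as np

def first_nonzero(arr, axis):
    # Index of the first non-zero entry along `axis`; rows that are all zeros
    # return the full axis length (here: n_steps) instead.
    mask = arr != 0
    return np.where(mask.any(axis=axis), mask.argmax(axis=axis), arr.shape[axis])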
Example #9
    def run_factored_batch(self):
        #(MINE) MAIN LOOP!!!
        # The reset happens through Monitor (except the very first one of the first batch, which is in run_agent)
        mb_actions = []
        mb_obs = []
        mb_values = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
        mb_values_goal = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
        mb_values_fire = np.zeros((self.envs.num_envs, self.n_steps + 1), dtype=np.float32)
        mb_rewards = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
        mb_rewards_goal = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
        mb_rewards_fire = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.float32)
        mb_done = np.zeros((self.envs.num_envs, self.n_steps), dtype=np.int32)

        latest_obs = self.latest_obs # (MINE) =state(t)
        for n in range(self.n_steps):
            # could calculate value estimate from obs when do training
            # but saving values here will make n step reward calculation a bit easier
            action_ids, value_estimate_goal, value_estimate_fire, value_estimate = self.agent.step_factored(latest_obs)
            # print('|step:', n, '|actions:', action_ids)  # (MINE) If you put it after the envs.step the SUCCESS appears at the envs.step so it will appear oddly
            if np.random.random() < 0.20: action_ids = np.array([self.envs.action_space.sample() for _ in range(self.envs.num_envs)])
            # (MINE) Store actions and value estimates for all steps
            mb_values[:, n] = value_estimate
            mb_values_goal[:, n] = value_estimate_goal
            mb_values_fire[:, n] = value_estimate_fire
            mb_obs.append(latest_obs)
            mb_actions.append((action_ids))
            # (MINE)  do action, return it to environment, get new obs and reward, store reward
            #actions_pp = self.action_processer.process(action_ids) # Actions have changed now need to check: BEFORE: actions.FunctionCall(actions.FUNCTIONS.no_op.id, []) NOW: actions.FUNCTIONS.no_op()
            obs_raw = self.envs.step(action_ids)
            #obs_raw.reward = reward
            latest_obs = self.obs_processer.process(obs_raw[0]) # For obs_raw as tuple! #(MINE) =state(t+1). Processes all inputs/obs from all timesteps (and envs)
            #print('-->|rewards:', np.round(np.mean(obs_raw[1]), 3))
            mb_rewards[:, n] = [t for t in obs_raw[1]]
            temp = [t for t in obs_raw[3]]
            mb_rewards_goal[:,n] = [d['goal'] for d in temp]
            mb_rewards_fire[:, n] = [d['fire'] for d in temp]
            mb_done[:, n] = [t for t in obs_raw[2]]

            # IF MAX_STEPS OR GOAL REACHED
            if obs_raw[2].any(): # At least one env has done=True
                for i in (np.argwhere(obs_raw[2])): # Run the loop for ONLY the envs that have finished
                    indx = i[0]
                    epis_reward = obs_raw[3][indx]['episode']['r']
                    epis_length = obs_raw[3][indx]['episode']['l']
                    last_step_r = obs_raw[1][indx]
                    self._handle_episode_end(epis_reward, epis_length, last_step_r)

            # indx=0 # env count
            # for t in obs_raw[2]: # Monitor returns additional stuff such as epis_reward and epis_length etc apart the obs, r, done, info
            #     #obs_raw[2] = done = [True, False, False, True,...] each element corresponds to an env
            #     if t == True: # done=true
            #         # Put reward in scores
            #         epis_reward = obs_raw[3][indx]['episode']['r']
            #         epis_length = obs_raw[3][indx]['episode']['l']
            #         last_step_r = obs_raw[1][indx]
            #         self._handle_episode_end(epis_reward, epis_length, last_step_r) # The printing score process is NOT a parallel process apparrently as you input every reward (t) independently
            #     indx = indx + 1 # finished envs count
            # for t in obs_raw:
            #     if t.last():
            #         self._handle_episode_end(t)

        #print(">> Avg. Reward:",np.round(np.mean(mb_rewards),3))
        mb_values_goal[:, -1], mb_values_fire[:, -1], mb_values[:, -1] = self.agent.get_factored_value(latest_obs) # We bootstrap from the last step if not terminal (although no terminal check is done here)
        # R, G = calc_rewards(self, mb_rewards, mb_values[:,:-1], mb_values[:,1:], mb_done, self.discount)
        # R = R.reshape((self.n_envs, self.n_steps)) # self.n_envs returns 1, self.envs.num_envs
        # G = G.reshape((self.n_envs, self.n_steps))
        # He replaces the last value with a bootstrap value. E.g. s_t-->V(s_t),s_{lastnstep}-->V(s_{lastnstep}), s_{lastnstep+1}
        # and we replace the V(s_{lastnstep}) with V(s_{lastnstep+1})
        n_step_advantage_goal = general_nstep_adv_sequential(
            mb_rewards_goal,
            mb_values_goal,
            self.discount,
            mb_done,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
            nenvs=self.n_envs,
            maxsteps=self.n_steps
        )
        n_step_advantage_fire = general_nstep_adv_sequential(
            mb_rewards_fire,
            mb_values_fire,
            self.discount,
            mb_done,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
            nenvs=self.n_envs,
            maxsteps=self.n_steps
        )
        # n_step_advantage = n_step_advantage_goal + n_step_advantage_fire
        n_step_advantage = general_nstep_adv_sequential(
            mb_rewards,
            mb_values,
            self.discount,
            mb_done,
            lambda_par=self.ppo_par.lambda_par if self.is_ppo else 1.0,
            nenvs=self.n_envs,
            maxsteps=self.n_steps
        )
        # disc_ret = calculate_n_step_reward(
        #     mb_rewards,
        #     self.discount,
        #     mb_values,
        # )
        full_input = {
            # these are transposed because action/obs
            # processers return [time, env, ...] shaped arrays. CHECK THIS OUT!!! YOU ALREADY HAVE THE TIMESTEP DIM!!!
            FEATURE_KEYS.advantage: n_step_advantage.transpose(),#, G.transpose()
            FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose(),
            FEATURE_KEYS.value_target_goal: (n_step_advantage_goal + mb_values_goal[:, :-1]).transpose(),  # R.transpose() # (we left out the last element); adding the value to the advantage gives the target for value-function training. Check OneNote in cmu_gym/Next Steps
            FEATURE_KEYS.value_target_fire: (n_step_advantage_fire + mb_values_fire[:, :-1]).transpose()
        }
        # full_input = {
        #     # these are transposed because action/obs
        #     # processers return [time, env, ...] shaped arrays CHECK THIS OUT!!!YOU HAVE ALREADY THE TIMESTEP DIM!!!
        #     FEATURE_KEYS.advantage: n_step_advantage.transpose(),#, G.transpose()
        #     FEATURE_KEYS.value_target: (n_step_advantage + mb_values[:, :-1]).transpose()# R.transpose()# (we left out the last element) if you add to the advantage the value you get the target for your value function training. Check onenote in cmu_gym/Next Steps
        # }
        #(MINE) Probably we combine all experiences from every worker below
        full_input.update(self.action_processer.combine_batch(mb_actions))
        full_input.update(self.obs_processer.combine_batch(mb_obs))
        # Comment out the line below for LSTM
        full_input = {k: combine_first_dimensions(v) for k, v in full_input.items()} # The function takes nsteps x nenvs x [dims] and combines it into nsteps*nenvs x [dims].

        if not self.do_training:
            pass
        elif self.agent.mode == ACMode.A2C:
            self.agent.train(full_input)
        elif self.agent.mode == ACMode.PPO:
            for epoch in range(self.ppo_par.n_epochs):
                self._train_ppo_epoch(full_input)
            self.agent.update_theta()

        self.latest_obs = latest_obs
        self.batch_counter += 1 # Used only for printing; the outer while loop controls the number of batches
        print('Batch %d finished' % self.batch_counter)
        sys.stdout.flush()