def test_Playground_continuous_Hard_Lunar():
    play = bloc.GymPlayground('LunarLanderContinuous-v2',
                              harderEnvCoeficient=1.5)
    assert play.ACTION_SPACE.shape == (2, )
    assert play.ACTION_CHOICES == 2
    assert play.OBSERVATION_SPACE.shape == (8, )
    assert play.OBSERVATION_DIM == 8
def gym_discrete_setup():
    """
    :return: (exp_spec, playground)
    :rtype: (ExperimentSpec, GymPlayground)
    """
    exp_spec = bloc.ExperimentSpec(batch_size_in_ts=1000,
                                   max_epoch=2,
                                   theta_nn_hidden_layer_topology=(2, 2))
    playground = bloc.GymPlayground('LunarLander-v2')
    yield exp_spec, playground
    tf_cv1.reset_default_graph()
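# Note: the yield-then-cleanup pattern above (yield the objects, then reset_default_graph())
#       follows the pytest setup/teardown idiom; these generators are presumably registered
#       as fixtures (e.g. with @pytest.fixture) elsewhere in the test suite.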
def gym_and_tf_discrete_setup():
    """
    :return: (obs_p, act_p, exp_spec, playground)
    :rtype: (tf.Tensor, tf.Tensor, ExperimentSpec, GymPlayground)
    """
    exp_spec = bloc.ExperimentSpec(batch_size_in_ts=1000, max_epoch=2, theta_nn_hidden_layer_topology=(2, 2))
    playground = bloc.GymPlayground('LunarLander-v2')
    obs_p, act_p, Q_values_ph = bloc.gym_playground_to_tensorflow_graph_adapter(playground,
                                                                                action_shape_constraint=(1,))
    yield obs_p, act_p, exp_spec, playground
    tf_cv1.reset_default_graph()
def gym_and_tf_SAC_Brain_continuous_setup():
    """
    :return: obs_t_ph, act_ph, obs_t_prime_ph, reward_t_ph, trj_done_t_ph, exp_spec, playground
    """
    exp_spec = bloc.ExperimentSpec()
    exp_spec.set_experiment_spec(unit_test_hparam)
    
    playground = bloc.GymPlayground('LunarLanderContinuous-v2')
    obs_t_ph, act_ph, _ = bloc.gym_playground_to_tensorflow_graph_adapter(playground)
    obs_t_prime_ph = bloc.continuous_space_placeholder(space=playground.OBSERVATION_SPACE,
                                                       name=vocab.obs_tPrime_ph)
    reward_t_ph = tf_cv1.placeholder(dtype=tf.float32, shape=(None,), name=vocab.rew_ph)
    trj_done_t_ph = tf_cv1.placeholder(dtype=tf.float32, shape=(None,), name=vocab.trj_done_ph)
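    # Note: together these placeholders are intended to hold a batch of SAC transitions
    #       (obs_t, act, obs_t_prime, reward_t, trj_done_t), i.e. (s_t, a_t, s_{t+1}, r_t, d_t).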

    yield obs_t_ph, act_ph, obs_t_prime_ph, reward_t_ph, trj_done_t_ph, exp_spec, playground
    tf_cv1.reset_default_graph()
def test_Playground_discreet():
    play = bloc.GymPlayground('LunarLander-v2')
    assert play.ACTION_CHOICES == 4
    assert play.OBSERVATION_DIM == 8
def test_Playground_continuous_Hard_no_env_FAIL():
    with pytest.raises(Exception):
        bloc.GymPlayground('Pendulum-v0', harderEnvCoeficient=1.5)
def test_Playground_continuous():
    play = bloc.GymPlayground('LunarLanderContinuous-v2')
    assert play.ACTION_SPACE.shape == (2, )
    assert play.ACTION_CHOICES == 2
    assert play.OBSERVATION_SPACE.shape == (8, )
    assert play.OBSERVATION_DIM == 8
def test_Playground_init_ENV_FAIL():
    with pytest.raises(Exception):
        bloc.GymPlayground('UnExistingEnvironment!!!')
    def _build_computation_graph(self):
        """ Build the Policy_phi, V_psi and Q_theta computation graph as multi-layer perceptron """

        self._set_random_seed()

        # (nice to have) TODO: implement --> add init hook:
        # Note: Second environment for policy evaluation
        self.evaluation_playground = bloc.GymPlayground(
            environment_name=self.exp_spec.prefered_environment)
        """ ---- Placeholder ---- """
        self.obs_t_ph = bloc.build_observation_placeholder(self.playground,
                                                           name=vocab.obs_t_ph)
        self.obs_t_prime_ph = bloc.build_observation_placeholder(
            self.playground, name=vocab.obs_tPrime_ph)
        self.act_ph = bloc.build_action_placeholder(self.playground,
                                                    name=vocab.act_ph)

        self.reward_t_ph = tf_cv1.placeholder(dtype=tf.float32,
                                              shape=(None, ),
                                              name=vocab.rew_ph)
        self.trj_done_t_ph = tf_cv1.placeholder(dtype=tf.float32,
                                                shape=(None, ),
                                                name=vocab.trj_done_ph)

        # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
        # /// Actor computation graph //////////////////////////////////////////////////////////////////////////////////
        with tf_cv1.variable_scope(vocab.actor_network):

            pi, pi_log_p, self.policy_mu = build_gaussian_policy_graph(
                self.obs_t_ph, self.exp_spec, self.playground)

            self.policy_pi, self.pi_log_likelihood = apply_action_bound(
                pi, pi_log_p)
            """ ---- Adjust policy distribution result to action range  ---- """
            if self.playground.ACTION_SPACE.bounded_above.all():
                self.policy_pi *= self.playground.ACTION_SPACE.high[0]
                self.policy_mu *= self.playground.ACTION_SPACE.high[0]
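            # Note: apply_action_bound squashes the sampled action and mu into a bounded range
            #       (typically [-1, 1] via tanh); rescaling by ACTION_SPACE.high[0] then maps them
            #       to the environment's action range, assuming a symmetric bounded Box space.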

        # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
        # /// Critic computation graph /////////////////////////////////////////////////////////////////////////////////
        with tf_cv1.variable_scope(vocab.critic_network):
            self.V_psi, self.V_psi_frozen = build_critic_graph_v_psi(
                self.obs_t_ph, self.obs_t_prime_ph, self.exp_spec)
            """ ---- Q_theta {1,2} according to sampled action & according to the reparametrized policy---- """
            self.Q_act_1, self.Q_pi_1 = build_critic_graph_q_theta(
                self.obs_t_ph,
                self.act_ph,
                self.policy_pi,
                self.exp_spec,
                name=vocab.Q_theta_1)
            self.Q_act_2, self.Q_pi_2 = build_critic_graph_q_theta(
                self.obs_t_ph,
                self.act_ph,
                self.policy_pi,
                self.exp_spec,
                name=vocab.Q_theta_2)
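            # Note: two Q_theta networks are built from the same placeholders (SAC-style double-Q);
            #       the training ops below receive both so they can combine them, usually by taking
            #       the minimum, to reduce value overestimation.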

        # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
        # /// Actor & Critic Training ops //////////////////////////////////////////////////////////////////////////////
        with tf_cv1.variable_scope(vocab.critic_training):
            critic_lr_schedule, critic_global_grad_step = critic_learning_rate_scheduler(
                self.exp_spec)

            self.V_psi_loss, self.V_psi_optimizer = critic_v_psi_train(
                self.V_psi, self.Q_pi_1, self.Q_pi_2, self.pi_log_likelihood,
                self.exp_spec, critic_lr_schedule, critic_global_grad_step)

            q_theta_train_ops = critic_q_theta_train(
                self.V_psi_frozen, self.Q_act_1, self.Q_act_2,
                self.reward_t_ph, self.trj_done_t_ph, self.exp_spec,
                critic_lr_schedule, critic_global_grad_step)

        self.q_theta_1_loss, self.q_theta_2_loss, self.q_theta_1_optimizer, self.q_theta_2_optimizer = q_theta_train_ops

        with tf_cv1.variable_scope(vocab.policy_training):
            self.actor_kl_loss, self.actor_policy_optimizer_op = actor_train(
                self.pi_log_likelihood, self.Q_pi_1, self.Q_pi_2,
                self.exp_spec)
        """ ---- Target nework update: V_psi --> frozen_V_psi ---- """
        with tf_cv1.variable_scope(vocab.target_update):
            self.V_psi_frozen_update_ops = update_frozen_v_psi_op(
                self.exp_spec['target_smoothing_coefficient'])
            self.init_frozen_v_psi_op = init_frozen_v_psi()
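            # Note: 'target_smoothing_coefficient' (tau) presumably drives an exponential moving
            #       average update of the frozen network, e.g. V_frozen <- tau * V_psi + (1 - tau) * V_frozen,
            #       while init_frozen_v_psi() copies the weights once at start-up.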

        tr_str = list_representation(
            tf_cv1.get_collection_ref(tf_cv1.GraphKeys.TRAINABLE_VARIABLES),
            ":: TRAINABLE_VARIABLES")
        print(tr_str)

        # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\
        # /// Summary ops //////////////////////////////////////////////////////////////////////////////////////////////

        # region :: Summary placeholders & ops ...
        """ ---- By Epoch summary: RETURNS & LENGTH ---- """
        self.summary_avg_trjs_return_ph = tf_cv1.placeholder(
            tf.float32,
            name=vocab.summary_ph + 'stoPi_stage_avg_trjs_return_ph')
        tf_cv1.summary.scalar('Epoch_average_trj_return_stochastic_pi',
                              self.summary_avg_trjs_return_ph,
                              family=vocab.G)

        self.summary_avg_trjs_len_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'stoPi_stage_avg_trjs_len_ph')
        tf_cv1.summary.scalar('Epoch_average_trj_length_stochastic_pi',
                              self.summary_avg_trjs_len_ph,
                              family=vocab.Trajectory_lenght)

        self.summary_eval_avg_trjs_return_ph = tf_cv1.placeholder(
            tf.float32,
            name=vocab.summary_ph + 'detPi_stage_avg_trjs_return_ph')
        tf_cv1.summary.scalar('Epoch_average_trj_return_deterministic_pi',
                              self.summary_eval_avg_trjs_return_ph,
                              family=vocab.G)

        self.summary_eval_avg_trjs_len_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'detPi_stage_avg_trjs_len_ph')
        tf_cv1.summary.scalar('Epoch_average_trj_length_deterministic_pi',
                              self.summary_eval_avg_trjs_len_ph,
                              family=vocab.Trajectory_lenght)
        """ ---- By Epoch summary: LOSS ---- """
        self.summary_avg_trjs_Vloss_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'Critic_V_loss_ph')
        tf_cv1.summary.scalar('critic_v_loss',
                              self.summary_avg_trjs_Vloss_ph,
                              family=vocab.loss)

        self.summary_avg_trjs_Q1loss_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'Critic_Q1_loss_ph')
        tf_cv1.summary.scalar('critic_q_1_loss',
                              self.summary_avg_trjs_Q1loss_ph,
                              family=vocab.loss)

        self.summary_avg_trjs_Q2loss_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'Critic_Q2_loss_ph')
        tf_cv1.summary.scalar('critic_q_2_loss',
                              self.summary_avg_trjs_Q2loss_ph,
                              family=vocab.loss)

        self.summary_avg_trjs_pi_loss_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'policy_loss_ph')
        tf_cv1.summary.scalar('policy_loss',
                              self.summary_avg_trjs_pi_loss_ph,
                              family=vocab.loss)
        """ ---- By Epoch summary: POLICY & VALUE fct ---- """

        self.summary_avg_pi_log_likelihood_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'pi_log_p_ph')
        tf_cv1.summary.scalar('policy_log_likelihood',
                              self.summary_avg_pi_log_likelihood_ph,
                              family=vocab.policy)

        # self.summary_avg_policy_pi_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'policy_pi_ph')
        # tf_cv1.summary.scalar('policy_py', self.summary_avg_policy_pi_ph, family=vocab.policy)
        #
        # self.summary_avg_policy_mu_ph = tf_cv1.placeholder(tf.float32, name=vocab.summary_ph + 'policy_mu_ph')
        # tf_cv1.summary.scalar('policy_mu', self.summary_avg_policy_mu_ph, family=vocab.policy)

        self.summary_avg_V_value_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'V_values_ph')
        tf_cv1.summary.scalar('V_values',
                              self.summary_avg_V_value_ph,
                              family=vocab.values)

        self.summary_avg_frozen_V_value_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'frozen_V_values_ph')
        tf_cv1.summary.scalar('frozen_V_values',
                              self.summary_avg_frozen_V_value_ph,
                              family=vocab.values)

        self.summary_avg_Q1_value_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'Q1_values_ph')
        tf_cv1.summary.scalar('Q1_values',
                              self.summary_avg_Q1_value_ph,
                              family=vocab.values)

        self.summary_avg_Q2_value_ph = tf_cv1.placeholder(
            tf.float32, name=vocab.summary_ph + 'Q2_values_ph')
        tf_cv1.summary.scalar('Q2_values',
                              self.summary_avg_Q2_value_ph,
                              family=vocab.values)

        self.summary_epoch_op = tf_cv1.summary.merge_all()
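        # Note: merge_all() bundles the scalar summaries declared above into a single op to be run
        #       once per epoch; the histogram below is created afterward, so it is not part of the
        #       merged op and is evaluated separately.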
        """ ---- Distribution summary ---- """
        self.summary_hist_policy_pi = tf_cv1.summary.histogram(
            'policy_py_tensor', self.policy_pi, family=vocab.policy)
        """ ---- By Trajectory summary ---- """
        # self.summary_sto_pi_TRJ_return_ph = tf_cv1.placeholder(tf.float32,
        #                                                        name=vocab.summary_ph + 'summary_stoPi_trj_return_ph')
        # self.summary_sto_pi_TRJ_return_op = tf_cv1.summary.scalar('Trajectory_return_stochastic_pi',
        #                                                           self.summary_sto_pi_TRJ_return_ph, family=vocab.G)
        #
        # self.summary_sto_pi_TRJ_lenght_ph = tf_cv1.placeholder(tf.float32,
        #                                                        name=vocab.summary_ph + 'summary_stoPi_trj_lenght_ph')
        # self.summary_sto_pi_TRJ_lenght_op = tf_cv1.summary.scalar('Trajectory_lenght_stochastic_pi',
        #                                                           self.summary_sto_pi_TRJ_lenght_ph,
        #                                                           family=vocab.Trajectory_lenght)
        #
        # self.summary_TRJ_op = tf_cv1.summary.merge([self.summary_sto_pi_TRJ_return_op,
        #                                             self.summary_sto_pi_TRJ_lenght_op])

        # endregion
        return None
def train(env_name='CartPole-v0',
          hidden_sizes=[32],
          lr=1e-2,
          epochs=50,
          batch_size=5000,
          render=False):

    # make environment, check spaces, get obs / act dims
    # env = gym.make(env_name)                                                             # ////// Original bloc //////

    REINFORCE_integration_test = {                                                         # \\\\\\    My bloc    \\\\\\
        'prefered_environment': env_name,
        'paramameter_set_name': 'REINFORCE integration test on CartPole-v0',
        'batch_size_in_ts': batch_size,
        'max_epoch': epochs,
        'discounted_reward_to_go': False,
        'discout_factor': 0.999,
        'learning_rate': lr,
        'theta_nn_h_layer_topo': tuple(hidden_sizes),
        'random_seed': 42,
        'theta_hidden_layers_activation': tf.nn.tanh,  # tf.nn.relu,
        'theta_output_layers_activation': None,
        'render_env_every_What_epoch': 100,
        'print_metric_every_what_epoch': 5,
    }
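    # Note: the spec above mirrors the function arguments (batch_size, epochs, lr, hidden_sizes)
    #       in the hyperparameter format consumed by ExperimentSpec.set_experiment_spec() below.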
    playground = BLOC.GymPlayground(env_name)  # \\\\\\    My bloc    \\\\\\
    env = playground.env  # \\\\\\    My bloc    \\\\\\
    exp_spec = BLOC.ExperimentSpec()  # \\\\\\    My bloc    \\\\\\
    exp_spec.set_experiment_spec(
        REINFORCE_integration_test)  # \\\\\\    My bloc    \\\\\\
    consol_print_learning_stats = ConsolPrintLearningStats(  # \\\\\\    My bloc    \\\\\\
        exp_spec,
        exp_spec.print_metric_every_what_epoch)  # \\\\\\    My bloc    \\\\\\

    assert isinstance(env.observation_space, Box), \
        "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, Discrete), \
        "This example only works for envs with discrete action spaces."

    obs_dim = env.observation_space.shape[0]
    n_acts = env.action_space.n

    # make core of policy network
    # obs_ph = tf.placeholder(shape=(None, obs_dim), dtype=tf.float32)                          # ////// Original bloc //////
    obs_ph, act_ph, weights_ph = BLOC.gym_playground_to_tensorflow_graph_adapter(
        playground)  # \\\\\\    My bloc    \\\\\\

    # logits = mlp(obs_ph, sizes=hidden_sizes+[n_acts])                                    # ////// Original bloc //////
    # logits = BLOC.build_MLP_computation_graph(obs_ph, playground,                        # \\\\\\    My bloc    \\\\\\
    #                                           hidden_layer_topology=tuple(hidden_sizes)) # \\\\\\    My bloc    \\\\\\

    # make action selection op (outputs int actions, sampled from policy)
    # actions = tf.squeeze(tf.multinomial(logits=logits,num_samples=1), axis=1)            # ////// Original bloc //////
    # actions, log_p_all = BLOC.policy_theta_discrete_space(logits, playground)            # \\\\\\    My bloc    \\\\\\

    # make loss function whose gradient, for the right data, is policy gradient
    # weights_ph = tf.placeholder(shape=(None,), dtype=tf.float32)                         # ////// Original bloc //////
    # act_ph = tf.placeholder(shape=(None,), dtype=tf.int32)                               # ////// Original bloc //////
    # action_masks = tf.one_hot(act_ph, n_acts)                                            # ////// Original bloc //////
    # log_probs = tf.reduce_sum(action_masks * tf.nn.log_softmax(logits), axis=1)          # ////// Original bloc //////
    # loss = -tf.reduce_mean(weights_ph * log_probs)                                       # ////// Original bloc //////

    # (!) First silent error caused by uneven batch size                                   # \\\\\\    My bloc    \\\\\\
    # loss = BLOC.discrete_pseudo_loss(log_p_all, act_ph, weights_ph, playground)          # \\\\\\    My bloc    \\\\\\

    reinforce_policy = REINFORCEbrain.REINFORCE_policy(
        obs_ph,
        act_ph,  # \\\\\\    My bloc    \\\\\\
        weights_ph,
        exp_spec,
        playground)  # \\\\\\    My bloc    \\\\\\
    (actions, _, loss) = reinforce_policy  # \\\\\\    My bloc    \\\\\\

    # make train op
    # train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)                   # ////// Original bloc //////
    train_op = BLOC.policy_optimizer(
        loss,
        learning_rate=exp_spec.learning_rate)  # \\\\\\    My bloc    \\\\\\

    # \\\\\\    My bloc    \\\\\\
    date_now = datetime.now()
    run_str = "Run--{}h{}--{}-{}-{}".format(date_now.hour, date_now.minute,
                                            date_now.day, date_now.month,
                                            date_now.year)
    # writer = tf_cv1.summary.FileWriter("./graph/{}".format(run_str), tf_cv1.get_default_graph())
    writer = tf_cv1.summary.FileWriter(
        "test_Z_integration/test_integrationREINFORCE/graph/{}".format(
            run_str), tf_cv1.get_default_graph())

    the_TRAJECTORY_COLLECTOR = TrajectoryCollector(
        exp_spec, playground)  # \\\\\\    My bloc    \\\\\\
    the_UNI_BATCH_COLLECTOR = UniformBatchCollector(
        exp_spec.batch_size_in_ts)  # \\\\\\    My bloc    \\\\\\
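    # Note: the TrajectoryCollector accumulates (obs, act, reward) tuples for one trajectory at a
    #       time, while the UniformBatchCollector aggregates finished trajectories until roughly
    #       batch_size_in_ts timesteps have been gathered (see the experience loop below).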

    # ////// Original bloc //////
    # sess = tf.InteractiveSession()
    # sess.run(tf.global_variables_initializer())

    # \\\\\\    My bloc    \\\\\\
    tf_cv1.set_random_seed(exp_spec.random_seed)
    np.random.seed(exp_spec.random_seed)
    with tf_cv1.Session() as sess:
        sess.run(tf_cv1.global_variables_initializer())  # initialize random variables in the computation graph
        consol_print_learning_stats.start_the_crazy_experiment()

        # for training policy
        def train_one_epoch():
            consol_print_learning_stats.next_glorious_epoch()                             # \\\\\\    My bloc    \\\\\\

            # ////// Original bloc //////
            # # make some empty lists for logging.
            # batch_obs = []          # for observations
            # batch_acts = []         # for actions
            # batch_weights = []      # for reward-to-go weighting in policy gradient
            # batch_rets = []         # for measuring episode returns
            # batch_lens = []         # for measuring episode lengths
            # ep_rews = []            # list for rewards accrued throughout ep

            # reset episode-specific variables
            obs = env.reset()  # first obs comes from starting distribution
            done = False  # signal from environment that episode is over

            # render first episode of each epoch
            finished_rendering_this_epoch = False

            consol_print_learning_stats.next_glorious_trajectory()                        # \\\\\\    My bloc    \\\\\\

            # collect experience by acting in the environment with current policy
            while True:

                # rendering
                if (not finished_rendering_this_epoch) and render:
                    env.render()

                # save obs
                # batch_obs.append(obs.copy())  # <-- (!) (Critical) append S_t not S_{t+1} ////// Original bloc //////

                # # act in the environment
                # act = sess.run(actions, {obs_ph: obs.reshape(1,-1)})[0]                # ////// Original bloc //////
                # obs, rew, done, _ = env.step(act)                                      # ////// Original bloc //////

                step_observation = BLOC.format_single_step_observation(obs)               # \\\\\\    My bloc    \\\\\\
                action_array = sess.run(actions, feed_dict={obs_ph: step_observation})    # \\\\\\    My bloc    \\\\\\
                act = blocAndTools.tensorflowbloc.to_scalar(action_array)                 # \\\\\\    My bloc    \\\\\\
                # obs, rew, done, _ = playground.env.step(act)   <-- (!) mistake          # \\\\\\    My bloc    \\\\\\
                # (!) Solution to silent error 2: don't overwrite S_t                       \\\\\\    My bloc    \\\\\\
                obs_prime, rew, done, _ = playground.env.step(act)  # <-- (!) Solution      \\\\\\    My bloc    \\\\\\

                # ////// Original bloc //////
                # # save action, reward
                # batch_acts.append(act)
                # ep_rews.append(rew)

                # (Critical) | Appending the observation S_t that triggered the action A_t is critical.  \\\\\\    My bloc    \\\\\\
                #            | If the observation appended is S_{t+1} instead, the agent won't learn.    \\\\\\    My bloc    \\\\\\
                the_TRAJECTORY_COLLECTOR.collect_OAR(obs, act, rew)  # <-- (!) Silent error 2             \\\\\\    My bloc    \\\\\\
                obs = obs_prime  # <-- (!) Solution to silent error 2                                     \\\\\\    My bloc    \\\\\\

                if done:
                    # ////// Original bloc //////
                    # # if episode is over, record info about episode
                    # ep_ret, ep_len = sum(ep_rews), len(ep_rews)
                    # batch_rets.append(ep_ret)
                    # batch_lens.append(ep_len)

                    trj_return = the_TRAJECTORY_COLLECTOR.trajectory_ended()              # \\\\\\    My bloc    \\\\\\
                    the_TRAJECTORY_COLLECTOR.compute_Qvalues_as_rewardToGo()
                    trj_container = the_TRAJECTORY_COLLECTOR.pop_trajectory_and_reset()   # \\\\\\    My bloc    \\\\\\
                    the_UNI_BATCH_COLLECTOR.collect(trj_container)                        # \\\\\\    My bloc    \\\\\\

                    consol_print_learning_stats.trajectory_training_stat(                 # \\\\\\    My bloc    \\\\\\
                        the_trajectory_return=trj_return, timestep=len(trj_container))

                    # the weight for each logprob(a_t|s_t) is reward-to-go from t
                    # batch_weights += list(reward_to_go(ep_rews))                        # ////// Original bloc //////
                    # batch_weights += BLOC.reward_to_go(ep_rews)                        # \\\\\\    My bloc    \\\\\\
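                    # Note: the reward-to-go weight for timestep t is the tail sum of rewards,
                    #       Q_t = r_t + r_{t+1} + ... + r_T (optionally discounted), e.g. rewards
                    #       [1.0, 1.0, 1.0] give weights [3.0, 2.0, 1.0]; this is what
                    #       compute_Qvalues_as_rewardToGo() filled in for the trajectory popped above.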

                    # reset episode-specific variables
                    obs, done, ep_rews = env.reset(), False, []

                    consol_print_learning_stats.next_glorious_trajectory()                # \\\\\\    My bloc    \\\\\\

                    # won't render again this epoch
                    finished_rendering_this_epoch = True

                    # ////// Original bloc //////
                    # # end experience loop if we have enough of it
                    # if len(batch_obs) > batch_size:
                    #     break

                    if not the_UNI_BATCH_COLLECTOR.is_not_full():                         # \\\\\\    My bloc    \\\\\\
                        break                                                             # \\\\\\    My bloc    \\\\\\

            # ////// Original bloc //////
            # # take a single policy gradient update step
            # batch_loss, _ = sess.run([loss, train_op],
            #                          feed_dict={
            #                             obs_ph: np.array(batch_obs),
            #                             act_ph: np.array(batch_acts),
            #                             weights_ph: np.array(batch_weights)
            #                          })

            batch_container = the_UNI_BATCH_COLLECTOR.pop_batch_and_reset()       # \\\\\\    My bloc    \\\\\\
            (batch_rets, batch_lens) = batch_container.get_basic_metric()         # \\\\\\    My bloc    \\\\\\
            batch_obs = batch_container.batch_observations  # \\\\\\    My bloc    \\\\\\
            batch_acts = batch_container.batch_actions  # \\\\\\    My bloc    \\\\\\
            batch_weights = batch_container.batch_Qvalues  # \\\\\\    My bloc    \\\\\\

            feed_dictionary = blocAndTools.tensorflowbloc.build_feed_dictionary(  # \\\\\\    My bloc    \\\\\\
                [obs_ph, act_ph, weights_ph],
                [batch_obs, batch_acts, batch_weights])

            batch_loss, _ = sess.run([loss, train_op],                            # \\\\\\    My bloc    \\\\\\
                                     feed_dict=feed_dictionary)                   # \\\\\\    My bloc    \\\\\\

            return batch_loss, batch_rets, batch_lens

        # training loop
        for i in range(epochs):
            batch_loss, batch_rets, batch_lens = train_one_epoch()
            mean_return = np.mean(batch_rets)
            average_len = np.mean(batch_lens)

            # ////// Original bloc //////
            # print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f' %
            #       (i, batch_loss, mean_return, average_len))

            # \\\\\\    My bloc    \\\\\\
            consol_print_learning_stats.epoch_training_stat(
                epoch_loss=batch_loss,
                epoch_average_trjs_return=mean_return,
                epoch_average_trjs_lenght=average_len,
                number_of_trj_collected=0,
                total_timestep_collected=0)

            yield (i, batch_loss, mean_return, average_len)

    print("\n>>> Close session\n")
    writer.close()
    playground.env.close()
    tf_cv1.reset_default_graph()
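
# Usage sketch (assumption, not part of the original module): train() is a generator that yields
# (epoch, batch_loss, mean_return, average_len) once per epoch, so a caller could drive it like:
#
#     for epoch, batch_loss, mean_return, average_len in train(env_name='CartPole-v0', epochs=5):
#         print(epoch, batch_loss, mean_return, average_len)
#
# Iterating it to exhaustion also runs the writer/environment cleanup at the end of the function.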