def sac1(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=200,
         start_steps=10000, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        reward_scale (float): Reward scale factor; logged episode returns are
            divided by it.

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) If 'auto', alpha
            is tuned automatically.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len_train (int): Maximum length of a training trajectory / episode / rollout.

        max_ep_len_test (int): Maximum length of a test trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    if not args.is_test:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, x2_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################

    saver = tf.train.Saver()
    checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints'
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    if args.is_test or args.is_restore_train:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################

    if args.is_test:
        test_env = gym.make(args.env)
        ave_ep_ret = 0
        for j in range(10000):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not d:  # (d or (ep_len == 2000)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if args.test_render:
                    test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '({}/10000)'.format(j + 1))
        return

    ############################## train ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                # test_env.render()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret_best = test_ep_ret = -10000.0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if ep_len==max_ep_len_train else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            ep_index += 1
            print('episode: {}, reward: {}'.format(ep_index, ep_ret / reward_scale))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in
            the original paper.
            """
            for j in range(int(1.5 * ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], Alpha=outs[6])

            logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            test_agent(10)
            # test_ep_ret = logger.get_stats('TestEpRet')[0]
            # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            if logger.get_stats('TestEpRet')[0] >= 280:
                print('Recalculating TestEpRet...')
                test_agent(100)
                test_ep_ret = logger.get_stats('TestEpRet')[0]
                # logger.epoch_dict['TestEpRet'] = []
                if test_ep_ret >= 300:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                        ep_index, test_ep_ret))
                    exit()
            print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Num_Ep', ep_index)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=False)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model
            if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t)
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
def sac1(apr, ts_env, env_fn, replay_buffer, name, vae=None, x_train=None,
         actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250,
         start_steps=10, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    # '''
    # def sac1(apr, ts_env, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
    #          steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99, reward_scale=1.0,
    #          polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250, start_steps=10000,
    #          max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(), save_freq=1):
    # '''

    # if not apr.is_test:
    #     logger = EpochLogger(**logger_kwargs)
    #     logger.save_config(locals())

    frames = []
    buffer = []

    tf.set_random_seed(seed)
    np.random.seed(seed)
    print(start_steps)
    epch = 1
    apr.l_ep_ret = -70000
    apr.l_ep_len = 1

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, apr.ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, x2_ph, apr.ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, apr.ph, **ac_kwargs)

    # Experience buffer is passed in as an argument
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################

    saver = tf.train.Saver()

    # if not os.path.exists(apr.checkpoint_path_r):
    #     os.makedirs(apr.checkpoint_path_r)
    if not os.path.exists(apr.checkpoint_path_wr):
        os.makedirs(apr.checkpoint_path_wr)
    # checkpoint_path_r = apr.checkpoint_path_r

    if apr.is_test or apr.is_restore_train:
        # ckpt = tf.train.get_checkpoint_state(apr.checkpoint_path_wr)
        print("Search ckpt...")
        # if ckpt and ckpt.model_checkpoint_path:
        #     saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("Model restored.")
        save_path = saver.restore(sess, "content\\model.ckpt")
        print("Model restored in path: %s" % save_path)

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################

    if apr.is_test:
        # test_env = gym.make(a_env)
        test_env = ts_env
        # test_env = BWg()
        ave_ep_ret = 0
        for j in range(start_steps):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                action = get_action(o, True)
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '--- {} /'.format(j + 1), start_steps)
        return

    ############################## train ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            start_pos = test_env.pos[0]
            pit_x = test_env.pit_x
            stump_x = test_env.stump_x
            stairs_x = test_env.stairs_x
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            finish_pos = test_env.pos[0]
            # Count how many obstacles of each type lie on the traversed stretch of terrain
            count_pit = 0
            count_stump = 0
            count_stairs = 0
            for pit in pit_x:
                if start_pos < pit < finish_pos:
                    count_pit += 1
            for stump in stump_x:
                if start_pos < stump < finish_pos:
                    count_stump += 1
            for stair in stairs_x:
                if start_pos < stair < finish_pos:
                    count_stairs += 1
            apr.l_ep_ret = int(ep_ret)
            apr.l_ep_len = ep_len
            # print(apr.l_ep_ret)
        return (count_pit, count_stump, count_stairs, finish_pos - start_pos,
                len(pit_x), len(stump_x), len(stairs_x))

    # --------------------------------------------
    start_time = time.time()
    if vae is None and x_train is None:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    elif vae is not None:
        o, r, d, ep_ret, ep_len = vae[0].get_data()[0], 0, False, 0, 0
    elif x_train is not None:
        count = 0
        data = x_train[count]
        o, ep_ret, ep_len = data[0], 0, 0

    total_steps = steps_per_epoch * epochs

    test_ep_ret = -10000.0
    test_ep_ret_best = apr.bestr

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        if vae is None and x_train is None:
            # Collect experience from the real environment
            o2, r, d, _ = env.step(a)
            # env.render(mode='rgb_array')
            ep_ret += r
            ep_len += 1
            replay_buffer.store(o, a, r, o2, d)
            buffer += [[o, o2, a, r, d]]
            o = o2
        elif vae is not None:
            # Draw a transition generated by the VAE
            o_1, o2_1, a_1, r_1, d_1 = vae.get_data()
            ep_ret += r_1
            ep_len += 1
            replay_buffer.store(o_1, a_1, r_1, o2_1, d_1)
        elif x_train is not None:
            # Replay a transition from the pre-recorded dataset
            data = x_train[count]
            # data = random.choice(x_train)
            o, o2, a, r, d = data
            count += 1
            ep_ret += r
            ep_len += 1
            replay_buffer.store(o, a, r, o2, d)

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in
            the original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             apr.ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            count_pit, count_stump, count_stairs, way, len_pit_x, len_stump_x, len_stairs = test_agent(1)
            test_ep_ret = apr.l_ep_ret
            print(f'epoch = {epch}, TestEpRet = {test_ep_ret}, Best = {test_ep_ret_best}, '
                  f'distance covered = {way}, pits cleared: {count_pit}/{len_pit_x}, '
                  f'stumps cleared: {count_stump}/{len_stump_x}, stairs cleared: {count_stairs}/{len_stairs}')
            epch += 1

            if test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, "content\\model.ckpt")
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
                np.savez_compressed('replay_{}'.format(name), np.array(buffer))
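

# The variant above dumps every collected transition with
# np.savez_compressed('replay_{}'.format(name), np.array(buffer)), one [o, o2, a, r, d] row per
# environment step, and its x_train branch consumes rows in exactly that order.  The helper below
# is a small sketch (not part of the original code) of how such an archive can be loaded back into
# the format expected by the x_train argument; the path in the usage comment is a placeholder.
def load_replay_archive(path):
    """Load a replay archive saved by sac1 and return an array of [o, o2, a, r, d] rows."""
    archive = np.load(path, allow_pickle=True)
    return archive['arr_0']

# Usage sketch (hypothetical file name):
#   transitions = load_replay_archive('replay_stairs.npz')
#   sac1(apr, ts_env, env_fn, replay_buffer, name='stairs', x_train=transitions)
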
def sac1(apr, ts_env, env_fn, vae=None, x_train=None,
         actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250,
         start_steps=10, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    # '''
    # def sac1(apr, ts_env, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
    #          steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99, reward_scale=1.0,
    #          polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250, start_steps=10000,
    #          max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(), save_freq=1):
    # '''

    # if not apr.is_test:
    #     logger = EpochLogger(**logger_kwargs)
    #     logger.save_config(locals())

    frames = []
    buffer = []
    dw = VideoWriter(150, 100, 60, 'test.avi')

    tf.set_random_seed(seed)
    np.random.seed(seed)
    print(start_steps)
    epch = 1
    apr.l_ep_ret = -70000
    apr.l_ep_len = 1

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, apr.ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, x2_ph, apr.ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, apr.ph, **ac_kwargs)

    # Experience buffer is not used in this variant (batches come from the VAEs below)
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################

    saver = tf.train.Saver()

    # if not os.path.exists(apr.checkpoint_path_r):
    #     os.makedirs(apr.checkpoint_path_r)
    if not os.path.exists(apr.checkpoint_path_wr):
        os.makedirs(apr.checkpoint_path_wr)
    # checkpoint_path_r = apr.checkpoint_path_r

    if apr.is_test or apr.is_restore_train:
        # ckpt = tf.train.get_checkpoint_state(apr.checkpoint_path_wr)
        print("Search ckpt...")
        # if ckpt and ckpt.model_checkpoint_path:
        #     saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("Model restored.")
        save_path = saver.restore(sess, "content\\model.ckpt")
        print("Model restored in path: %s" % save_path)

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################

    if apr.is_test:
        # test_env = gym.make(a_env)
        test_env = ts_env
        # test_env = BWg()
        ave_ep_ret = 0
        for j in range(start_steps):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                action = get_action(o, True)
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '--- {} /'.format(j + 1), start_steps)
        return

    ############################## train ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            start_pos = test_env.pos[0]
            pit_x = test_env.pit_x
            stump_x = test_env.stump_x
            stairs_x = test_env.stairs_x
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            finish_pos = test_env.pos[0]
            # Count how many obstacles of each type lie on the traversed stretch of terrain
            count_pit = 0
            count_stump = 0
            count_stairs = 0
            for pit in pit_x:
                if start_pos < pit < finish_pos:
                    count_pit += 1
            for stump in stump_x:
                if start_pos < stump < finish_pos:
                    count_stump += 1
            for stair in stairs_x:
                if start_pos < stair < finish_pos:
                    count_stairs += 1
            apr.l_ep_ret = int(ep_ret)
            apr.l_ep_len = ep_len
            # print(apr.l_ep_ret)
        return (count_pit, count_stump, count_stairs, finish_pos - start_pos,
                len(pit_x), len(stump_x), len(stairs_x))

    # --------------------------------------------
    start_time = time.time()
    if vae is None and x_train is None:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    elif vae is not None:
        o, r, d, ep_ret, ep_len = vae.get_data()[0], 0, False, 0, 0
    elif x_train is not None:
        count = 0
        data = x_train[count]
        o, ep_ret, ep_len = data[0], 0, 0

    total_steps = steps_per_epoch * epochs
    print(total_steps)

    test_ep_ret = -10000.0
    test_ep_ret_best = apr.bestr

    n = 50
    min_n = 50
    m = 50
    max_m = 50
    mix_offset = 5000000000  # number of steps after which buffer mixing begins
    mix_step = 1  # number of steps over which the mixing ratio changes by one

    # Main loop: collect experience in env and update/log each epoch
    vae1 = Vae(config)
    vae2 = Vae(config)

    # Fit the first VAE on the replay collected on the stairs terrain ('лестницы' = stairs)
    x_train = np.load('replay_лестницы.npz', allow_pickle=True)['arr_0']
    y = np.array([np.array(xi) for xi in x_train[:, 0]])
    for i in range(1, 5):
        if i < 3:
            temp = np.array([np.array(xi) for xi in x_train[:, i]])
        else:
            temp = np.array([np.array(xi) for xi in x_train[:, i]]).reshape(-1, 1)
        y = np.concatenate([y, temp], axis=1)
    vae1.fit_vae(y)

    # Fit the second VAE on the replay collected on the pits terrain ('ямы' = pits)
    # x_train = np.load('replay_{}.npz'.format(ENV))['arr_0']
    x_train = np.load('replay_ямы.npz', allow_pickle=True)['arr_0']
    y = np.array([np.array(xi) for xi in x_train[:, 0]])
    for i in range(1, 5):
        if i < 3:
            temp = np.array([np.array(xi) for xi in x_train[:, i]])
        else:
            temp = np.array([np.array(xi) for xi in x_train[:, i]]).reshape(-1, 1)
        y = np.concatenate([y, temp], axis=1)
    vae2.fit_vae(y)

    for t in range(total_steps):
        for j in range(5000):
            # t = time.time()
            batch = get_vae_batches(vae1, vae2, n, m)
            if t > mix_offset:
                if t % mix_step == 0 and n != min_n:
                    n -= 1
                    m += 1
            # print(time.time()-t)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         apr.ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done'],
                         }
            # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
            #             train_pi_op, train_value_op, target_update]
            outs = sess.run(step_ops, feed_dict)

        # End of epoch wrap-up
        epoch = t
        count_pit, count_stump, count_stairs, way, len_pit_x, len_stump_x, len_stairs = test_agent(1)
        test_ep_ret = apr.l_ep_ret
        print(f'epoch = {epch}, TestEpRet = {test_ep_ret}, Best = {test_ep_ret_best}, '
              f'distance covered = {way}, pits: {count_pit}/{len_pit_x}, '
              f'stairs: {count_stairs}/{len_stairs}')
        if (count_pit + count_stairs) / (len_pit_x + len_stairs) > 0.85:
            break
        epch += 1

        # if test_ep_ret > test_ep_ret_best:
        #     save_path = saver.save(sess, "content\\model.ckpt")
        #     print("Model saved in path: %s" % save_path)
        #     test_ep_ret_best = test_ep_ret

    # Write the collected render frames out as a video
    height, width, layers = frames[0].shape
    out = cv2.VideoWriter('out.avi', cv2.VideoWriter_fourcc(*'DIVX'), 60, (width, height))
    for i in range(len(frames)):
        out.write(frames[i])
    out.release()
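

# get_vae_batches() used above is defined elsewhere in the repo.  The function below is only a
# sketch of one way such a helper could assemble a training batch from the two fitted VAEs,
# under two assumptions that are not confirmed by this extract: (a) each generated row uses the
# same column layout the VAEs were fitted on above (obs | obs2 | acts | rews | done), and
# (b) the Vae class exposes a hypothetical sample(k) method returning k generated rows.
def get_vae_batches_sketch(vae1, vae2, n, m, obs_dim, act_dim):
    """Mix n generated transitions from vae1 with m from vae2 into a feed-dict-ready batch."""
    rows = np.concatenate([vae1.sample(n), vae2.sample(m)], axis=0)
    np.random.shuffle(rows)
    o_end = obs_dim
    o2_end = 2 * obs_dim
    a_end = 2 * obs_dim + act_dim
    return dict(obs1=rows[:, :o_end],
                obs2=rows[:, o_end:o2_end],
                acts=rows[:, o2_end:a_end],
                rews=rows[:, a_end],
                done=rows[:, a_end + 1])
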
def init_model(self, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
               steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
               reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250,
               start_steps=10, max_ep_len_train=1000, max_ep_len_test=1000):
    frames = []

    tf.set_random_seed(seed)
    np.random.seed(seed)
    print(start_steps)
    self.apr.l_ep_ret = -70000
    self.apr.l_ep_len = 1

    env, test_env = self.env_fn(3), self.env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    self.x_ph, self.apr.ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main1'):
        print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(
            self.x_ph, x2_ph, self.apr.ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target1'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, self.apr.ph, **ac_kwargs)

    # Experience buffer
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main1/pi', 'main1/q1', 'main1/q2', 'main1'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha1', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main1/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main1/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main1'), get_vars('target1'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main1'), get_vars('target1'))])

    saver = tf.train.Saver()
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(target_init)

    if not os.path.exists(self.apr.checkpoint_path_wr):
        os.makedirs(self.apr.checkpoint_path_wr)
    # checkpoint_path_r = apr.checkpoint_path_r

    if self.apr.is_test or self.apr.is_restore_train:
        # ckpt = tf.train.get_checkpoint_state(apr.checkpoint_path_wr)
        print("Search ckpt...")
        # if ckpt and ckpt.model_checkpoint_path:
        #     saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("Model restored.")
        save_path = saver.restore(self.sess, "content1\\model.ckpt")
        print("Model restored in path: %s" % save_path)

    if self.apr.is_test:
        return
        # Evaluation rollout kept from the standalone version; unreachable after the early return.
        # test_env = gym.make(a_env)
        test_env = self.ts_env
        # test_env = BWg()
        ave_ep_ret = 0
        for j in range(start_steps):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                action = self.get_action(o, True)
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
                if self.apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '--- {} /'.format(j + 1), start_steps)
        return

    ############################## train ############################

    def test_agent(n=25):
        global mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
                if self.apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            self.apr.l_ep_ret = int(ep_ret)
            self.apr.l_ep_len = ep_len
            # print(apr.l_ep_ret)

    # --------------------------------------------
    start_time = time.time()
    if self.vae is None:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    else:
        o, r, d, ep_ret, ep_len = self.vae.get_data(), 0, False, 0, 0

    total_steps = steps_per_epoch * epochs

    test_ep_ret = -10000.0
    test_ep_ret_best = self.apr.bestr

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = self.get_action(o)
        else:
            a = env.action_space.sample()

        if self.vae is None:
            o2, r, d, _ = env.step(a)
            # env.render(mode='rgb_array')
            ep_ret += r
            ep_len += 1
            self.replay_buffer.store(o, a, r, o2, d)
            o = o2
        else:
            # Dream rollout: observation from the VAE, transition predicted by the dense net
            o = self.vae.get_data()
            a = self.sac.get_action(o)
            o2, r = self.densenet.get_data(np.hstack((o.reshape(1, -1), a.reshape(1, -1))))
            ep_ret += r
            ep_len += 1
            self.replay_buffer.store(o, a, r.reshape(-1), o2.reshape(-1), d)

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in
            the original paper.
            """
            for j in range(ep_len):
                batch = self.replay_buffer.sample_batch(batch_size)
                feed_dict = {self.x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             self.apr.ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = self.sess.run(step_ops, feed_dict)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            test_agent(1)
            test_ep_ret = self.apr.l_ep_ret
            print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            if test_ep_ret > test_ep_ret_best:
                save_path = saver.save(self.sess, "content1\\model.ckpt")
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret