def sac1(args, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=200,
         start_steps=10000, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    """
    Args:
        env_fn : A function which creates a copy of the environment.
            The environment must satisfy the OpenAI Gym API.

        actor_critic: A function which takes in placeholder symbols
            for state, ``x_ph``, and action, ``a_ph``, and returns the main
            outputs from the agent's Tensorflow computation graph:

            ===========  ================  ======================================
            Symbol       Shape             Description
            ===========  ================  ======================================
            ``mu``       (batch, act_dim)  | Computes mean actions from policy
                                           | given states.
            ``pi``       (batch, act_dim)  | Samples actions from policy given
                                           | states.
            ``logp_pi``  (batch,)          | Gives log probability, according to
                                           | the policy, of the action sampled by
                                           | ``pi``. Critical: must be differentiable
                                           | with respect to policy parameters all
                                           | the way through action sampling.
            ``q1``       (batch,)          | Gives one estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q2``       (batch,)          | Gives another estimate of Q* for
                                           | states in ``x_ph`` and actions in
                                           | ``a_ph``.
            ``q1_pi``    (batch,)          | Gives the composition of ``q1`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q1(x, pi(x)).
            ``q2_pi``    (batch,)          | Gives the composition of ``q2`` and
                                           | ``pi`` for states in ``x_ph``:
                                           | q2(x, pi(x)).
            ===========  ================  ======================================

        ac_kwargs (dict): Any kwargs appropriate for the actor_critic
            function you provided to SAC.

        seed (int): Seed for random number generators.

        steps_per_epoch (int): Number of steps of interaction (state-action pairs)
            for the agent and the environment in each epoch.

        epochs (int): Number of epochs to run and train agent.

        replay_size (int): Maximum length of replay buffer.

        gamma (float): Discount factor. (Always between 0 and 1.)

        reward_scale (float): Reward scale factor; logged episode returns are
            divided by it.

        polyak (float): Interpolation factor in polyak averaging for target
            networks. Target networks are updated towards main networks
            according to:

            .. math:: \\theta_{\\text{targ}} \\leftarrow
                \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta

            where :math:`\\rho` is polyak. (Always between 0 and 1, usually
            close to 1.)

        lr (float): Learning rate (used for policy/value/alpha learning).

        alpha (float/'auto'): Entropy regularization coefficient. (Equivalent to
            inverse of reward scale in the original SAC paper.) If 'auto', alpha
            is tuned automatically.

        batch_size (int): Minibatch size for SGD.

        start_steps (int): Number of steps for uniform-random action selection,
            before running real policy. Helps exploration.

        max_ep_len_train (int): Maximum length of a training trajectory / episode / rollout.

        max_ep_len_test (int): Maximum length of a test trajectory / episode / rollout.

        logger_kwargs (dict): Keyword args for EpochLogger.

        save_freq (int): How often (in terms of gap between epochs) to save
            the current policy and value function.
    """
    if not args.is_test:
        logger = EpochLogger(**logger_kwargs)
        logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, x2_ph, a_ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, a_ph, **ac_kwargs)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################

    saver = tf.train.Saver()
    checkpoint_path = logger_kwargs['output_dir'] + '/checkpoints'
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    if args.is_test or args.is_restore_train:
        ckpt = tf.train.get_checkpoint_state(checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("Model restored.")

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################

    if args.is_test:
        test_env = gym.make(args.env)
        ave_ep_ret = 0
        for j in range(10000):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not d:  # (d or (ep_len == 2000)):
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if args.test_render:
                    test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '({}/10000)'.format(j + 1))
        return

    ############################## train ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                # test_env.render()
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    ep_index = 0
    test_ep_ret_best = test_ep_ret = -10000.0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        # d = False if ep_len==max_ep_len_train else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            ep_index += 1
            print('episode: {}, reward: {}'.format(ep_index, ep_ret / reward_scale))
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in
            the original paper.
            """
            for j in range(int(1.5 * ep_len)):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             a_ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)
                logger.store(LossPi=outs[0], LossQ1=outs[1], LossQ2=outs[2],
                             Q1Vals=outs[3], Q2Vals=outs[4], LogPi=outs[5], Alpha=outs[6])

            logger.store(EpRet=ep_ret / reward_scale, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            test_agent(10)
            # test_ep_ret = logger.get_stats('TestEpRet')[0]
            # print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)
            if logger.get_stats('TestEpRet')[0] >= 280:
                print('Recalculating TestEpRet...')
                test_agent(100)
                test_ep_ret = logger.get_stats('TestEpRet')[0]
                # logger.epoch_dict['TestEpRet'] = []
                if test_ep_ret >= 300:
                    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                        ep_index, test_ep_ret))
                    exit()
            print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            # logger.store(): store the data; logger.log_tabular(): log the data;
            # logger.dump_tabular(): write the data
            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('Num_Ep', ep_index)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=False)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Alpha', average_only=True)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            # logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LogPi', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ1', average_only=True)
            logger.log_tabular('LossQ2', average_only=True)
            # logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()

            # Save model
            if ((epoch % save_freq == 0) or (epoch == epochs - 1)) and test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, checkpoint_path + '/model.ckpt', t)
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
def sac1(apr, ts_env, env_fn, replay_buffer, name, vae=None, x_train=None,
         actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250,
         start_steps=10, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    # '''
    # def sac1(apr, ts_env, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
    #          steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99, reward_scale=1.0,
    #          polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250, start_steps=10000,
    #          max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(), save_freq=1):
    # '''

    # if not apr.is_test:
    #     logger = EpochLogger(**logger_kwargs)
    #     logger.save_config(locals())

    frames = []
    buffer = []

    tf.set_random_seed(seed)
    np.random.seed(seed)
    print(start_steps)
    epch = 1
    apr.l_ep_ret = -70000
    apr.l_ep_len = 1

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, apr.ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, x2_ph, apr.ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, apr.ph, **ac_kwargs)

    # Experience buffer is passed in as an argument
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################

    saver = tf.train.Saver()

    # if not os.path.exists(apr.checkpoint_path_r):
    #     os.makedirs(apr.checkpoint_path_r)
    if not os.path.exists(apr.checkpoint_path_wr):
        os.makedirs(apr.checkpoint_path_wr)
    # checkpoint_path_r = apr.checkpoint_path_r

    if apr.is_test or apr.is_restore_train:
        # ckpt = tf.train.get_checkpoint_state(apr.checkpoint_path_wr)
        print("Search ckpt...")
        # if ckpt and ckpt.model_checkpoint_path:
        #     saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("Model restored.")
        save_path = saver.restore(sess, "content\\model.ckpt")
        print("Model restored in path: %s" % save_path)

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################

    if apr.is_test:
        # test_env = gym.make(a_env)
        test_env = ts_env
        # test_env = BWg()
        ave_ep_ret = 0
        for j in range(start_steps):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                action = get_action(o, True)
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '--- {} /'.format(j + 1), start_steps)
        return

    ############################## train ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            start_pos = test_env.pos[0]
            pit_x = test_env.pit_x
            stump_x = test_env.stump_x
            stairs_x = test_env.stairs_x
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            finish_pos = test_env.pos[0]
            # Count how many obstacles of each type lie on the traversed stretch of terrain
            count_pit = 0
            count_stump = 0
            count_stairs = 0
            for pit in pit_x:
                if start_pos < pit < finish_pos:
                    count_pit += 1
            for stump in stump_x:
                if start_pos < stump < finish_pos:
                    count_stump += 1
            for stair in stairs_x:
                if start_pos < stair < finish_pos:
                    count_stairs += 1
            apr.l_ep_ret = int(ep_ret)
            apr.l_ep_len = ep_len
            # print(apr.l_ep_ret)
        return (count_pit, count_stump, count_stairs, finish_pos - start_pos,
                len(pit_x), len(stump_x), len(stairs_x))

    # --------------------------------------------
    start_time = time.time()
    if vae is None and x_train is None:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    elif vae is not None:
        o, r, d, ep_ret, ep_len = vae[0].get_data()[0], 0, False, 0, 0
    elif x_train is not None:
        count = 0
        data = x_train[count]
        o, ep_ret, ep_len = data[0], 0, 0

    total_steps = steps_per_epoch * epochs

    test_ep_ret = -10000.0
    test_ep_ret_best = apr.bestr

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = get_action(o)
        else:
            a = env.action_space.sample()

        if vae is None and x_train is None:
            # Collect experience from the real environment
            o2, r, d, _ = env.step(a)
            # env.render(mode='rgb_array')
            ep_ret += r
            ep_len += 1
            replay_buffer.store(o, a, r, o2, d)
            buffer += [[o, o2, a, r, d]]
            o = o2
        elif vae is not None:
            # Draw a transition generated by the VAE
            o_1, o2_1, a_1, r_1, d_1 = vae.get_data()
            ep_ret += r_1
            ep_len += 1
            replay_buffer.store(o_1, a_1, r_1, o2_1, d_1)
        elif x_train is not None:
            # Replay a transition from the pre-recorded dataset
            data = x_train[count]
            # data = random.choice(x_train)
            o, o2, a, r, d = data
            count += 1
            ep_ret += r
            ep_len += 1
            replay_buffer.store(o, a, r, o2, d)

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in
            the original paper.
            """
            for j in range(ep_len):
                batch = replay_buffer.sample_batch(batch_size)
                feed_dict = {x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             apr.ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = sess.run(step_ops, feed_dict)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            count_pit, count_stump, count_stairs, way, len_pit_x, len_stump_x, len_stairs = test_agent(1)
            test_ep_ret = apr.l_ep_ret
            print(f'epoch = {epch}, TestEpRet = {test_ep_ret}, Best = {test_ep_ret_best}, '
                  f'distance covered = {way}, pits cleared: {count_pit}/{len_pit_x}, '
                  f'stumps cleared: {count_stump}/{len_stump_x}, stairs cleared: {count_stairs}/{len_stairs}')
            epch += 1

            if test_ep_ret > test_ep_ret_best:
                save_path = saver.save(sess, "content\\model.ckpt")
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret
                np.savez_compressed('replay_{}'.format(name), np.array(buffer))
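

# The variant above dumps every collected transition with
# np.savez_compressed('replay_{}'.format(name), np.array(buffer)), one [o, o2, a, r, d] row per
# environment step, and its x_train branch consumes rows in exactly that order.  The helper below
# is a small sketch (not part of the original code) of how such an archive can be loaded back into
# the format expected by the x_train argument; the path in the usage comment is a placeholder.
def load_replay_archive(path):
    """Load a replay archive saved by sac1 and return an array of [o, o2, a, r, d] rows."""
    archive = np.load(path, allow_pickle=True)
    return archive['arr_0']

# Usage sketch (hypothetical file name):
#   transitions = load_replay_archive('replay_stairs.npz')
#   sac1(apr, ts_env, env_fn, replay_buffer, name='stairs', x_train=transitions)
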
def sac1(apr, ts_env, env_fn, vae=None, x_train=None,
         actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
         reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250,
         start_steps=10, max_ep_len_train=1000, max_ep_len_test=1000,
         logger_kwargs=dict(), save_freq=1):
    # '''
    # def sac1(apr, ts_env, env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
    #          steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99, reward_scale=1.0,
    #          polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250, start_steps=10000,
    #          max_ep_len_train=1000, max_ep_len_test=1000, logger_kwargs=dict(), save_freq=1):
    # '''

    # if not apr.is_test:
    #     logger = EpochLogger(**logger_kwargs)
    #     logger.save_config(locals())

    frames = []
    buffer = []
    dw = VideoWriter(150, 100, 60, 'test.avi')

    tf.set_random_seed(seed)
    np.random.seed(seed)
    print(start_steps)
    epch = 1
    apr.l_ep_ret = -70000
    apr.l_ep_len = 1

    env, test_env = env_fn(3), env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    x_ph, apr.ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main'):
        mu, pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(x_ph, x2_ph, apr.ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, apr.ph, **ac_kwargs)

    # Experience buffer is not used in this variant (batches come from the VAEs below)
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main'), get_vars('target'))])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    ############################## save and restore ############################

    saver = tf.train.Saver()

    # if not os.path.exists(apr.checkpoint_path_r):
    #     os.makedirs(apr.checkpoint_path_r)
    if not os.path.exists(apr.checkpoint_path_wr):
        os.makedirs(apr.checkpoint_path_wr)
    # checkpoint_path_r = apr.checkpoint_path_r

    if apr.is_test or apr.is_restore_train:
        # ckpt = tf.train.get_checkpoint_state(apr.checkpoint_path_wr)
        print("Search ckpt...")
        # if ckpt and ckpt.model_checkpoint_path:
        #     saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("Model restored.")
        save_path = saver.restore(sess, "content\\model.ckpt")
        print("Model restored in path: %s" % save_path)

    def get_action(o, deterministic=False):
        act_op = mu if deterministic else pi
        return sess.run(act_op, feed_dict={x_ph: o.reshape(1, -1)})[0]

    ############################## test ############################

    if apr.is_test:
        # test_env = gym.make(a_env)
        test_env = ts_env
        # test_env = BWg()
        ave_ep_ret = 0
        for j in range(start_steps):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                action = get_action(o, True)
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '--- {} /'.format(j + 1), start_steps)
        return

    ############################## train ############################

    def test_agent(n=25):
        global sess, mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            start_pos = test_env.pos[0]
            pit_x = test_env.pit_x
            stump_x = test_env.stump_x
            stairs_x = test_env.stairs_x
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(get_action(o, True))
                ep_ret += r
                ep_len += 1
                if apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            finish_pos = test_env.pos[0]
            # Count how many obstacles of each type lie on the traversed stretch of terrain
            count_pit = 0
            count_stump = 0
            count_stairs = 0
            for pit in pit_x:
                if start_pos < pit < finish_pos:
                    count_pit += 1
            for stump in stump_x:
                if start_pos < stump < finish_pos:
                    count_stump += 1
            for stair in stairs_x:
                if start_pos < stair < finish_pos:
                    count_stairs += 1
            apr.l_ep_ret = int(ep_ret)
            apr.l_ep_len = ep_len
            # print(apr.l_ep_ret)
        return (count_pit, count_stump, count_stairs, finish_pos - start_pos,
                len(pit_x), len(stump_x), len(stairs_x))

    # --------------------------------------------
    start_time = time.time()
    if vae is None and x_train is None:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    elif vae is not None:
        o, r, d, ep_ret, ep_len = vae.get_data()[0], 0, False, 0, 0
    elif x_train is not None:
        count = 0
        data = x_train[count]
        o, ep_ret, ep_len = data[0], 0, 0

    total_steps = steps_per_epoch * epochs
    print(total_steps)

    test_ep_ret = -10000.0
    test_ep_ret_best = apr.bestr

    n = 50
    min_n = 50
    m = 50
    max_m = 50
    mix_offset = 5000000000  # number of steps after which buffer mixing begins
    mix_step = 1  # number of steps over which the mixing ratio changes by one

    # Main loop: collect experience in env and update/log each epoch
    vae1 = Vae(config)
    vae2 = Vae(config)

    # Fit the first VAE on the replay collected on the stairs terrain ('лестницы' = stairs)
    x_train = np.load('replay_лестницы.npz', allow_pickle=True)['arr_0']
    y = np.array([np.array(xi) for xi in x_train[:, 0]])
    for i in range(1, 5):
        if i < 3:
            temp = np.array([np.array(xi) for xi in x_train[:, i]])
        else:
            temp = np.array([np.array(xi) for xi in x_train[:, i]]).reshape(-1, 1)
        y = np.concatenate([y, temp], axis=1)
    vae1.fit_vae(y)

    # Fit the second VAE on the replay collected on the pits terrain ('ямы' = pits)
    # x_train = np.load('replay_{}.npz'.format(ENV))['arr_0']
    x_train = np.load('replay_ямы.npz', allow_pickle=True)['arr_0']
    y = np.array([np.array(xi) for xi in x_train[:, 0]])
    for i in range(1, 5):
        if i < 3:
            temp = np.array([np.array(xi) for xi in x_train[:, i]])
        else:
            temp = np.array([np.array(xi) for xi in x_train[:, i]]).reshape(-1, 1)
        y = np.concatenate([y, temp], axis=1)
    vae2.fit_vae(y)

    for t in range(total_steps):
        for j in range(5000):
            # t = time.time()
            batch = get_vae_batches(vae1, vae2, n, m)
            if t > mix_offset:
                if t % mix_step == 0 and n != min_n:
                    n -= 1
                    m += 1
            # print(time.time()-t)
            feed_dict = {x_ph: batch['obs1'],
                         x2_ph: batch['obs2'],
                         apr.ph: batch['acts'],
                         r_ph: batch['rews'],
                         d_ph: batch['done'],
                         }
            # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
            #             train_pi_op, train_value_op, target_update]
            outs = sess.run(step_ops, feed_dict)

        # End of epoch wrap-up
        epoch = t
        count_pit, count_stump, count_stairs, way, len_pit_x, len_stump_x, len_stairs = test_agent(1)
        test_ep_ret = apr.l_ep_ret
        print(f'epoch = {epch}, TestEpRet = {test_ep_ret}, Best = {test_ep_ret_best}, '
              f'distance covered = {way}, pits: {count_pit}/{len_pit_x}, '
              f'stairs: {count_stairs}/{len_stairs}')
        if (count_pit + count_stairs) / (len_pit_x + len_stairs) > 0.85:
            break
        epch += 1

        # if test_ep_ret > test_ep_ret_best:
        #     save_path = saver.save(sess, "content\\model.ckpt")
        #     print("Model saved in path: %s" % save_path)
        #     test_ep_ret_best = test_ep_ret

    # Write the collected render frames out as a video
    height, width, layers = frames[0].shape
    out = cv2.VideoWriter('out.avi', cv2.VideoWriter_fourcc(*'DIVX'), 60, (width, height))
    for i in range(len(frames)):
        out.write(frames[i])
    out.release()
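

# get_vae_batches() used above is defined elsewhere in the repo.  The function below is only a
# sketch of one way such a helper could assemble a training batch from the two fitted VAEs,
# under two assumptions that are not confirmed by this extract: (a) each generated row uses the
# same column layout the VAEs were fitted on above (obs | obs2 | acts | rews | done), and
# (b) the Vae class exposes a hypothetical sample(k) method returning k generated rows.
def get_vae_batches_sketch(vae1, vae2, n, m, obs_dim, act_dim):
    """Mix n generated transitions from vae1 with m from vae2 into a feed-dict-ready batch."""
    rows = np.concatenate([vae1.sample(n), vae2.sample(m)], axis=0)
    np.random.shuffle(rows)
    o_end = obs_dim
    o2_end = 2 * obs_dim
    a_end = 2 * obs_dim + act_dim
    return dict(obs1=rows[:, :o_end],
                obs2=rows[:, o_end:o2_end],
                acts=rows[:, o2_end:a_end],
                rews=rows[:, a_end],
                done=rows[:, a_end + 1])
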
def init_model(self, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0,
               steps_per_epoch=5000, epochs=100, replay_size=int(2e6), gamma=0.99,
               reward_scale=1.0, polyak=0.995, lr=5e-4, alpha=0.2, batch_size=250,
               start_steps=10, max_ep_len_train=1000, max_ep_len_test=1000):
    frames = []

    tf.set_random_seed(seed)
    np.random.seed(seed)
    print(start_steps)
    self.apr.l_ep_ret = -70000
    self.apr.l_ep_len = 1

    env, test_env = self.env_fn(3), self.env_fn(1)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    # Share information about action space with policy architecture
    ac_kwargs['action_space'] = env.action_space

    # Inputs to computation graph
    self.x_ph, self.apr.ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None)

    # Main outputs from computation graph
    with tf.variable_scope('main1'):
        print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
        self.mu, self.pi, logp_pi, logp_pi2, q1, q2, q1_pi, q2_pi = actor_critic(
            self.x_ph, x2_ph, self.apr.ph, **ac_kwargs)

    # Target value network
    with tf.variable_scope('target1'):
        _, _, logp_pi_, _, _, _, q1_pi_, q2_pi_ = actor_critic(x2_ph, x2_ph, self.apr.ph, **ac_kwargs)

    # Experience buffer
    # replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size)

    # Count variables
    var_counts = tuple(core.count_vars(scope) for scope in ['main1/pi', 'main1/q1', 'main1/q2', 'main1'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts)

    ######
    if alpha == 'auto':
        target_entropy = -np.prod(env.action_space.shape)

        log_alpha = tf.get_variable('log_alpha1', dtype=tf.float32, initializer=0.0)
        alpha = tf.exp(log_alpha)

        alpha_loss = tf.reduce_mean(-log_alpha * tf.stop_gradient(logp_pi + target_entropy))

        alpha_optimizer = tf.train.AdamOptimizer(learning_rate=lr * 0.1, name='alpha_optimizer')
        train_alpha_op = alpha_optimizer.minimize(loss=alpha_loss, var_list=[log_alpha])
    ######

    # Min Double-Q:
    min_q_pi = tf.minimum(q1_pi_, q2_pi_)

    # Targets for Q and V regression
    v_backup = tf.stop_gradient(min_q_pi - alpha * logp_pi2)
    q_backup = r_ph + gamma * (1 - d_ph) * v_backup

    # Soft actor-critic losses
    pi_loss = tf.reduce_mean(alpha * logp_pi - q1_pi)
    q1_loss = 0.5 * tf.reduce_mean((q_backup - q1) ** 2)
    q2_loss = 0.5 * tf.reduce_mean((q_backup - q2) ** 2)
    value_loss = q1_loss + q2_loss

    # Policy train op
    # (has to be separate from value train op, because q1_pi appears in pi_loss)
    pi_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main1/pi'))

    # Value train op
    # (control dep of train_pi_op because sess.run otherwise evaluates in nondeterministic order)
    value_optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    value_params = get_vars('main1/q')
    with tf.control_dependencies([train_pi_op]):
        train_value_op = value_optimizer.minimize(value_loss, var_list=value_params)

    # Polyak averaging for target variables
    # (control flow because sess.run otherwise evaluates in nondeterministic order)
    with tf.control_dependencies([train_value_op]):
        target_update = tf.group([tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
                                  for v_main, v_targ in zip(get_vars('main1'), get_vars('target1'))])

    # All ops to call during one training step
    if isinstance(alpha, Number):
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, tf.identity(alpha),
                    train_pi_op, train_value_op, target_update]
    else:
        step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                    train_pi_op, train_value_op, target_update, train_alpha_op]

    # Initializing targets to match main variables
    target_init = tf.group([tf.assign(v_targ, v_main)
                            for v_main, v_targ in zip(get_vars('main1'), get_vars('target1'))])

    saver = tf.train.Saver()
    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())
    self.sess.run(target_init)

    if not os.path.exists(self.apr.checkpoint_path_wr):
        os.makedirs(self.apr.checkpoint_path_wr)
    # checkpoint_path_r = apr.checkpoint_path_r

    if self.apr.is_test or self.apr.is_restore_train:
        # ckpt = tf.train.get_checkpoint_state(apr.checkpoint_path_wr)
        print("Search ckpt...")
        # if ckpt and ckpt.model_checkpoint_path:
        #     saver.restore(sess, ckpt.model_checkpoint_path)
        #     print("Model restored.")
        save_path = saver.restore(self.sess, "content1\\model.ckpt")
        print("Model restored in path: %s" % save_path)

    if self.apr.is_test:
        return
        # Evaluation rollout kept from the standalone version; unreachable after the early return.
        # test_env = gym.make(a_env)
        test_env = self.ts_env
        # test_env = BWg()
        ave_ep_ret = 0
        for j in range(start_steps):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                action = self.get_action(o, True)
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
                if self.apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            ave_ep_ret = (j * ave_ep_ret + ep_ret) / (j + 1)
            print('ep_len', ep_len, 'ep_ret:', ep_ret, 'ave_ep_ret:', ave_ep_ret,
                  '--- {} /'.format(j + 1), start_steps)
        return

    ############################## train ############################

    def test_agent(n=25):
        global mu, pi, q1, q2, q1_pi, q2_pi
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len_test)):
                # Take deterministic actions at test time
                o, r, d, _ = test_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
                if self.apr.test_render:
                    frames.append(test_env.render(mode='rgb_array'))
                    # test_env.render()
            self.apr.l_ep_ret = int(ep_ret)
            self.apr.l_ep_len = ep_len
            # print(apr.l_ep_ret)

    # --------------------------------------------
    start_time = time.time()
    if self.vae is None:
        o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    else:
        o, r, d, ep_ret, ep_len = self.vae.get_data(), 0, False, 0, 0

    total_steps = steps_per_epoch * epochs

    test_ep_ret = -10000.0
    test_ep_ret_best = self.apr.bestr

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards,
        use the learned policy.
        """
        if t > start_steps:
            a = self.get_action(o)
        else:
            a = env.action_space.sample()

        if self.vae is None:
            o2, r, d, _ = env.step(a)
            # env.render(mode='rgb_array')
            ep_ret += r
            ep_len += 1
            self.replay_buffer.store(o, a, r, o2, d)
            o = o2
        else:
            # Dream rollout: observation from the VAE, transition predicted by the dense net
            o = self.vae.get_data()
            a = self.sac.get_action(o)
            o2, r = self.densenet.get_data(np.hstack((o.reshape(1, -1), a.reshape(1, -1))))
            ep_ret += r
            ep_len += 1
            self.replay_buffer.store(o, a, r.reshape(-1), o2.reshape(-1), d)

        # End of episode. Training (ep_len times).
        if d or (ep_len == max_ep_len_train):
            """
            Perform all SAC updates at the end of the trajectory.
            This is a slight difference from the SAC specified in
            the original paper.
            """
            for j in range(ep_len):
                batch = self.replay_buffer.sample_batch(batch_size)
                feed_dict = {self.x_ph: batch['obs1'],
                             x2_ph: batch['obs2'],
                             self.apr.ph: batch['acts'],
                             r_ph: batch['rews'],
                             d_ph: batch['done'],
                             }
                # step_ops = [pi_loss, q1_loss, q2_loss, q1, q2, logp_pi, alpha,
                #             train_pi_op, train_value_op, target_update]
                outs = self.sess.run(step_ops, feed_dict)

            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        # End of epoch wrap-up
        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch
            test_agent(1)
            test_ep_ret = self.apr.l_ep_ret
            print('TestEpRet', test_ep_ret, 'Best:', test_ep_ret_best)

            if test_ep_ret > test_ep_ret_best:
                save_path = saver.save(self.sess, "content1\\model.ckpt")
                print("Model saved in path: %s" % save_path)
                test_ep_ret_best = test_ep_ret