def run(args):
    """Tune the PAM margin (unless given) and log accuracy statistics.

    args: dict with at least 'dataset'; optional keys:
      'margin'  -- skips the margin grid-search when present,
      'verbose' -- additionally logs the mean score of every candidate margin.
    Results are written via the tabular logger.
    """
    logger.configure(
        f'logs/{args["dataset"]}/pam/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)
    pam_arg = args.copy()
    pool = mp.Pool(mp.cpu_count())
    try:
        if 'margin' not in pam_arg:
            # Each worker scores every candidate margin; average the
            # per-margin scores across runs, then keep the argmax.
            best_margin = pool.map(find_best_margin, make_arg_list(pam_arg))
            best_margin = np.mean(best_margin, 0)
            if pam_arg.get('verbose'):
                for i in range(len(best_margin)):
                    logger.record_tabular(f'[PAM] margin = {MARGINS[i]}',
                                          best_margin[i])
                logger.dump_tabular()
            best_margin = MARGINS[best_margin.argmax()]
            logger.record_tabular('[PAM] best margin', best_margin)
            pam_arg['margin'] = best_margin
        results_pam = pool.map(run_pam, make_arg_list(pam_arg))
    finally:
        # Fix: the pool was previously never closed/joined (leaked workers).
        pool.close()
        pool.join()
    logger.record_tabular('[PAM] accuracy mean', np.mean(results_pam))
    logger.record_tabular('[PAM] accuracy max', np.max(results_pam))
    logger.record_tabular('[PAM] accuracy min', np.min(results_pam))
    logger.record_tabular('[PAM] accuracy std', np.std(results_pam))
    logger.dump_tabular()
def run(args):
    """Tune the C-SVM class weight C1 (unless given) and log accuracy statistics.

    args: dict with at least 'dataset'; optional keys:
      'C1'      -- skips the C1 grid-search when present,
      'verbose' -- additionally logs the mean score of every candidate C1.
    Results are written via the tabular logger.
    """
    logger.configure(
        f'logs/{args["dataset"]}/svm/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)
    svm_arg = args.copy()
    pool = mp.Pool(mp.cpu_count())
    try:
        if 'C1' not in svm_arg:
            # Each worker scores every candidate class weight; average the
            # per-candidate scores across runs, then keep the argmax.
            best_c1 = pool.map(find_best_c1, make_arg_list(svm_arg))
            best_c1 = np.mean(best_c1, 0)
            if svm_arg.get('verbose'):
                for i in range(len(best_c1)):
                    logger.record_tabular(f'[C-SVM] C1 = {CLASS_WEIGHTS[i]}',
                                          best_c1[i])
                logger.dump_tabular()
            best_c1 = CLASS_WEIGHTS[best_c1.argmax()]
            logger.record_tabular('[C-SVM] best C1', best_c1)
            svm_arg['C1'] = best_c1
        results_svm = pool.map(run_c_svm, make_arg_list(svm_arg))
    finally:
        # Fix: the pool was previously never closed/joined (leaked workers).
        pool.close()
        pool.join()
    logger.record_tabular('[C-SVM] accuracy mean', np.mean(results_svm))
    logger.record_tabular('[C-SVM] accuracy max', np.max(results_svm))
    logger.record_tabular('[C-SVM] accuracy min', np.min(results_svm))
    logger.record_tabular('[C-SVM] accuracy std', np.std(results_svm))
    logger.dump_tabular()
def call(self, on_policy):
    """Run one ACER-style training step and periodically dump stats.

    on_policy: when True, collect a fresh rollout from the env runner
    (feeding episode stats and, if a replay buffer exists, storing the
    transition batch); when False, sample a stored batch from the buffer.
    Either way the batch is flattened to (nbatch, ...) and passed to
    model.predict for a training update.
    """
    env_runner, model, buffer, steps = self.env_runner, self.model, \
        self.buffer, self.steps
    if on_policy:
        enc_obs, obs, actions, rewards, mus, dones, masks = env_runner.run()
        self.episode_stats.feed(rewards, dones)
        if buffer is not None:
            buffer.put(enc_obs, actions, rewards, mus, dones, masks)
    else:
        # get obs, actions, rewards, mus, dones from buffer.
        obs, actions, rewards, mus, dones, masks = buffer.get()
    # reshape stuff correctly: flatten (nenv, nsteps, ...) into one batch axis
    obs = obs.reshape(env_runner.batch_ob_shape)
    actions = actions.reshape([env_runner.nbatch])
    rewards = rewards.reshape([env_runner.nbatch])
    mus = mus.reshape([env_runner.nbatch, env_runner.nact])
    dones = dones.reshape([env_runner.nbatch])
    masks = masks.reshape([env_runner.batch_ob_shape[0]])
    names_ops, values_ops = model.predict(
        obs, actions, rewards, dones, mus, model.initial_state, masks,
        steps
    )
    # Only log on on-policy steps, every `log_interval` batches.
    if on_policy and (int(steps / env_runner.nbatch) % self.log_interval == 0):
        logger.record_tabular("total_timesteps", steps)
        logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
        # IMP: In EpisodicLife env, during training, we get
        # done=True at each loss of life, not just at the terminal
        # state. Thus, this is mean until end of life, not end of
        # episode. For true episode rewards, see the monitor
        # files in the log folder.
        logger.record_tabular(
            "mean_episode_length", self.episode_stats.mean_length()
        )
        logger.record_tabular(
            "mean_episode_reward", self.episode_stats.mean_reward()
        )
        for name, val in zip(names_ops, values_ops):
            logger.record_tabular(name, float(val))
        logger.dump_tabular()
def find_best_alpha_val(kargs):
    """Grid-search the peer-loss weight 'alpha' by validation accuracy.

    kargs: dict whose 'alpha' entry is a list of candidate values; optional
    'verbose' logs each candidate's mean final-epoch validation accuracy.
    Returns {'alpha': best_value}.
    """
    if len(kargs['alpha']) == 1:
        # Single candidate: nothing to search.
        return {'alpha': kargs['alpha'][0]}
    args = kargs.copy()
    verbose = args.get('verbose', False)
    pool = mp.Pool(mp.cpu_count())
    try:
        results = []
        for alpha in kargs['alpha']:
            args['alpha'] = alpha
            # Mean final-epoch validation accuracy across parallel runs.
            res = [
                res['val_acc']
                for res in pool.map(run_nn_peer_val, make_arg_list(args))
            ]
            res = np.mean(res, axis=0)[-1]
            if verbose:
                logger.record_tabular(f'[PEER] alpha = {alpha}', res)
            results.append(res)
    finally:
        pool.close()
        pool.join()
    if verbose:
        # Fix: previously dump_tabular() ran unconditionally, emitting an
        # empty tabular row when verbose logging was off.
        logger.dump_tabular()
    best_alpha = kargs['alpha'][np.argmax(results)]
    return {'alpha': best_alpha}
def train(self):
    """Conduct ensemble actor-critic training (originally: "CG: the function
    that conducts ensemble training").

    Repeatedly samples from the environment with one AC instance at a time,
    trains every AC instance (plus an action-selection Q-ensemble) on shared
    batches, and evaluates after each epoch.

    Each element of self._alg_instances is indexable as
    [algorithm, running_return, episode_count, ucb_score] — inferred from the
    index usage below; TODO confirm against the constructor.
    :return: None
    """
    # Set up parameters for the training process.
    self._n_epochs = self._base_ac_params['n_epochs']
    self._epoch_length = self._base_ac_params['epoch_length']
    self._n_train_repeat = self._base_ac_params['n_train_repeat']
    self._n_initial_exploration_steps = self._base_ac_params[
        'n_initial_exploration_steps']
    self._eval_render = self._base_ac_params['eval_render']
    self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
    self._eval_deterministic = self._base_ac_params['eval_deterministic']

    # Set up the evaluation environment.
    if self._eval_n_episodes > 0:
        with tf.variable_scope("low_level_policy", reuse=True):
            self._eval_env = deep_clone(self._env)

    # Import required libraries for training.
    import random
    import math
    import operator
    import numpy as np

    # Initialize the sampler with a randomly chosen AC instance's policy.
    alg_ins = random.choice(self._alg_instances)
    self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

    # Perform the training/evaluation process.
    num_episode = 0.
    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.log('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                isEpisodeEnd = self._sampler.sample()

                # If an episode is ended, we need to update performance
                # statistics for each AC instance and pick randomly another
                # AC instance for next episode of exploration.
                if isEpisodeEnd:
                    num_episode = num_episode + 1.
                    # Exponential moving average of the episode return.
                    alg_ins[1] = 0.9 * alg_ins[
                        1] + 0.1 * self._sampler._last_path_return
                    alg_ins[2] = alg_ins[2] + 1.

                    if self._use_ucb:
                        # Select an algorithm instance based on UCB:
                        # any instance not yet tried is picked first;
                        # otherwise the one with the highest UCB score.
                        selected = False
                        for ains in self._alg_instances:
                            if ains[2] < 1.:
                                alg_ins = ains
                                selected = True
                                break
                            else:
                                ains[3] = ains[1] + math.sqrt(
                                    2.0 * math.log(num_episode) / ains[2])
                        if not selected:
                            alg_ins = max(self._alg_instances,
                                          key=operator.itemgetter(3))
                    else:
                        # Select an algorithm instance uniformly at random.
                        alg_ins = random.choice(self._alg_instances)
                    self._sampler.set_policy(alg_ins[0].policy)

                if not self._sampler.batch_ready():
                    continue
                gt.stamp('sample')

                # ================
                # Perform training.
                # ================
                for i in range(self._n_train_repeat):
                    batch = self._sampler.random_batch()

                    # ====================================
                    # Perform training over all AC instances.
                    # ====================================
                    for ains in self._alg_instances:
                        ains[0]._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=batch)

                    # =================================================
                    # Perform training of the action-selection Q-function.
                    # =================================================
                    # Set up the feed dictionary.
                    feed_dict = {
                        self._observations_ens_ph: batch['observations'],
                        self._obv_act_ph: batch['actions'],
                        self._observations_ens_next_ph:
                            batch['next_observations'],
                        self._rewards_ph: batch['rewards'],
                        self._terminals_ph: batch['terminals'],
                    }
                    # Each instance proposes next actions for the Q-ensemble
                    # targets.
                    for i, ains in enumerate(self._alg_instances):
                        with ains[0].policy.deterministic(
                                self._eval_deterministic):
                            feed_dict[self._acts_next_phs[i]] = ains[
                                0].policy.get_actions(
                                    batch['next_observations'])
                    # Perform training on the action-selection Q-function.
                    self._sess.run(self._q_ens_train_operator, feed_dict)

                gt.stamp('train')

            # ============================================================
            # Perform evaluation after one full epoch of training is completed.
            # ============================================================
            if self._eval_n_episodes < 1:
                continue

            if self._evaluation_strategy == 'ensemble':
                # Use a whole ensemble of AC instances for evaluation.
                paths = rollouts(self._eval_env, self,
                                 self._sampler._max_path_length,
                                 self._eval_n_episodes)
            elif self._evaluation_strategy == 'best-policy':
                # Choose the AC instance with the highest observed
                # performance so far for evaluation.
                eval_alg_ins = max(self._alg_instances,
                                   key=operator.itemgetter(1))
                with eval_alg_ins[0].policy.deterministic(
                        self._eval_deterministic):
                    paths = rollouts(self._eval_env,
                                     eval_alg_ins[0].policy,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)
            else:
                paths = None

            if paths is not None:
                total_returns = [path['rewards'].sum() for path in paths]
                episode_lengths = [len(p['rewards']) for p in paths]
                logger.record_tabular('return-average',
                                      np.mean(total_returns))
                logger.record_tabular('return-min', np.min(total_returns))
                logger.record_tabular('return-max', np.max(total_returns))
                logger.record_tabular('return-std', np.std(total_returns))
                logger.record_tabular('episode-length-avg',
                                      np.mean(episode_lengths))
                logger.record_tabular('episode-length-min',
                                      np.min(episode_lengths))
                logger.record_tabular('episode-length-max',
                                      np.max(episode_lengths))
                logger.record_tabular('episode-length-std',
                                      np.std(episode_lengths))
                self._eval_env.log_diagnostics(paths)
                if self._eval_render:
                    self._eval_env.render(paths)

            # Produce log info after each episode of training and evaluation.
            times_itrs = gt.get_times().stamps.itrs
            # 'eval' is only stamped from epoch 2 onwards, hence the guard.
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            self._sampler.log_diagnostics()
            logger.dump_tabular()
            # logger.pop_prefix()
            gt.stamp('eval')

        # Terminate the sampler after the training process is completed.
        self._sampler.terminate()
def main():
    """Learner entry point: publish weights over ZMQ, receive experience into
    a shared memory pool via a background process, and train continuously."""
    # Parse input parameters
    args, unknown_args = parser.parse_known_args()
    args.num_steps = int(args.num_steps)
    unknown_args = parse_cmdline_kwargs(unknown_args)

    # Load config file
    load_yaml_config(args, 'learner')

    # Expose socket to actor(s): PUB socket broadcasting pickled weights.
    context = zmq.Context()
    weights_socket = context.socket(zmq.PUB)
    weights_socket.bind(f'tcp://*:{args.param_port}')

    _, agent = init_components(args, unknown_args)

    # Configure experiment directory
    create_experiment_dir(args, 'LEARNER-')
    save_yaml_config(args.exp_path / 'config.yaml', args, 'learner', agent)
    args.log_path = args.exp_path / 'log'
    args.ckpt_path = args.exp_path / 'ckpt'
    args.ckpt_path.mkdir()
    args.log_path.mkdir()
    logger.configure(str(args.log_path))

    # Record commit hash
    with open(args.exp_path / 'hash', 'w') as f:
        f.write(
            str(
                subprocess.run('git rev-parse HEAD'.split(),
                               stdout=subprocess.PIPE).stdout.decode('utf-8')))

    # Variables to control the frequency of training
    receiving_condition = multiprocessing.Condition()
    num_receptions = multiprocessing.Value('i', 0)

    # Start memory pool in another process
    manager = MemPoolManager()
    manager.start()
    mem_pool = manager.MemPool(capacity=args.pool_size)
    Process(target=recv_data,
            args=(args.data_port, mem_pool, receiving_condition,
                  num_receptions, args.keep_training)).start()

    # Print throughput statistics
    Process(target=MultiprocessingMemPool.record_throughput,
            args=(mem_pool, args.record_throughput_interval)).start()

    freq = 0
    learn_flag = 0
    while True:
        # Until the first training step happens, keep re-broadcasting the
        # initial weights so late-joining actors can sync.
        if learn_flag == 0:
            weights_socket.send(pickle.dumps(agent.get_weights()))
        if len(mem_pool) >= args.batch_size:
            # Sync weights to actor
            weights = agent.get_weights()
            if hvd.rank() == 0:
                weights_socket.send(pickle.dumps(weights))

            # Periodic checkpointing (full checkpoint or raw weights).
            if freq % args.ckpt_save_freq == 0:
                if args.ckpt_save_type == 'checkpoint':
                    agent.save(args.ckpt_path / 'ckpt')
                elif args.ckpt_save_type == 'weight':
                    with open(args.ckpt_path / 'weight.ckpt', 'wb') as f:
                        pickle.dump(weights, f)

            if args.keep_training:
                # Train as fast as possible on whatever is in the pool.
                agent.learn(mem_pool.sample(size=args.batch_size))
            else:
                # Wait until enough fresh transitions have been received
                # before consuming a batch (paces training to data arrival).
                with receiving_condition:
                    while num_receptions.value < args.training_freq:
                        receiving_condition.wait()
                    data = mem_pool.sample(size=args.batch_size)
                    num_receptions.value -= args.training_freq
                # Training
                stat = agent.learn(data)
                learn_flag = 1
                if stat is not None:
                    for k, v in stat.items():
                        logger.record_tabular(k, v)
                logger.dump_tabular()
            freq += 1
def train(env_name, num_episodes, gamma, lam, kl_targ, batch_size, eval_freq,
          hid1_mult, init_policy_logvar, seed):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batch before test
        hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension)
        init_policy_logvar: natural log of initial policy variance
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create scaler
    scaler = Scaler(obs_dim)

    # create policy
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult,
                    init_policy_logvar).to(device)
    # create value_function
    value_function = ValueFunction(obs_dim, hid1_mult).to(device)

    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, episodes=5)

    # train & test models
    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iter in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # rollout
            trajectories, steps = run_policy(env, policy, scaler,
                                             episodes=batch_size)
            # process data
            current_episodes += len(trajectories)
            current_steps += steps
            add_value(trajectories, value_function)  # add estimated values to episodes
            add_disc_sum_rew(trajectories, gamma)  # calculated discounted sum of Rs
            add_gae(trajectories, gamma, lam)  # calculate advantage
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = \
                build_train_set(trajectories)
            train_returns = [np.sum(t["rewards"]) for t in trajectories]
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # add various stats to training log:
            #log_batch_stats(observes, actions, advantages, disc_sum_rew)
            # update policy
            policy.update(observes, actions, advantages)  # update policy
            # update value function
            value_function.update(observes, disc_sum_rew)  # update value function
        # test models
        num_test_episodes = 10
        trajectories, _ = run_policy(env, policy, scaler,
                                     episodes=num_test_episodes)
        avg_return = np.mean([np.sum(t["rewards"]) for t in trajectories])
        std_return = np.std([np.sum(t["rewards"]) for t in trajectories])
        logger.record_tabular('iteration', iter)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
def main():
    """Train a DQN meta-controller on Atari Freeway and periodically log,
    checkpoint, and sync the target network."""
    L.configure('/home/metalabadmin/exp/freeway',
                format_strs=['stdout', 'csv', 'tensorboard'])
    env = gym.make('Freeway-v0')
    env = wrapper.wrap_deepmind(env, frame_stack=True, scale=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    network = Q_network(env.observation_space, env.action_space.n, optimizer,
                        gamma=0.99, scope='freeway')
    m_controller = MetaController(network, env.action_space.n)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7),
                                 initial_p=1.0,
                                 final_p=0.02)
    replay = ReplayBuffer(50000)
    # get default tf_session
    sess = U.get_session()
    U.initialize()
    sess.run(m_controller.network.update_target_op)
    step = 0
    episodes = 0
    rewards = 0
    mean_100ep_reward = 0
    total_reward = []
    saved_mean_reward = None
    ob = env.reset()
    while step <= 1e7:
        # Epsilon-greedy action from the annealed exploration schedule.
        ep = exploration.value(step)
        ob_reshaped = np.reshape(ob, (1, ) + env.observation_space.shape)
        act = m_controller.sample_act(sess, ob_reshaped, update_eps=ep)[0]
        ob_tp1, reward_t, done_t, info = env.step(act)
        env.render()
        rewards += reward_t
        replay.add(ob, act, reward_t, ob_tp1, float(done_t))
        ob = ob_tp1
        # train every 4 steps
        if step >= 1000 and step % 4 == 0:
            obs, acts, rewards_t, obs_tp1, dones_t = replay.sample(64)
            weights, batch_idxes = np.ones_like(rewards_t), None
            # get q estimate for tp1 as 'supervised'
            obs_tp1_reshaped = np.reshape(
                obs_tp1, (64, ) + env.observation_space.shape)
            q_tp1 = m_controller.get_q(sess, obs_tp1_reshaped)[0]
            td_error = m_controller.train(sess, obs, acts, rewards_t,
                                          obs_tp1, dones_t, weights, q_tp1)
        step += 1
        # Refresh the target network every 1000 steps.
        if step >= 1000 and step % 1000 == 0:
            sess.run(m_controller.network.update_target_op)
        if done_t:
            ob = env.reset()
            total_reward.append(rewards)
            episodes += 1
            rewards = 0
            print('step %d done %s, ep %.2f' % (step, str(done_t), ep))
            # Rolling mean over the last 100 completed episodes.
            mean_100ep_reward = round(np.mean(total_reward[-101:-1]), 1)
            if episodes % 10 == 0 and episodes != 0:
                print('date time %s' % str(datetime.now()))
                L.record_tabular("steps", step)
                L.record_tabular("episodes", episodes)
                L.record_tabular("mean 100 episode reward",
                                 mean_100ep_reward)
                L.dump_tabular()
        # NOTE(review): placement of this save block relative to the
        # `done_t` branch was ambiguous in the original formatting —
        # confirm it should run every 1000 steps regardless of episode end.
        if step % 1000 == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".
                      format(saved_mean_reward, mean_100ep_reward))
                U.save_variables('./freewaymodel.ckpt')
                model_saved = True
                saved_mean_reward = mean_100ep_reward
# Initialize environment and reward type env = gym.make(args['gym_env'], reward_type=args['reward_type'], set_additional_goal=args['set_additional_goal']) # Set random seed in hope to reproductability env.seed(args['seed']) np.random.seed(args['seed']) tf.set_random_seed(args['seed']) logger.record_tabular("algo", args['algo']) logger.record_tabular("env", args['gym_env']) logger.record_tabular("env.set_additional_goal", env.set_additional_goal) logger.record_tabular("env.reward_type", env.reward_type) logger.dump_tabular() if args['algo'] == "ppo": # Make necessary directories maybe_mkdir(args['RUN_DIR']) maybe_mkdir(args['MODEL_DIR']) maybe_mkdir(args['FIGURE_DIR']) maybe_mkdir(args['RESULT_DIR']) ppo_params_json = os.environ[ 'PROJ_HOME_3'] + '/ppo1/ppo_params.json' # Start to train the policy from scratch # trained_policy = run(env=env, algorithm=ppo, params=ppo_params_json, args=args) # trained_policy.save_model(args['MODEL_DIR']) # Load model and continue training
def learn(
        env,
        policy_fn,
        timesteps_per_actorbatch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    """PPO (clipped-surrogate) training loop in the OpenAI Baselines style.

    Builds the clipped surrogate + entropy + value losses over a new/old
    policy pair, then alternates rollout collection and several epochs of
    minibatch Adam updates until exactly one of the four time constraints
    (max_timesteps / max_episodes / max_iters / max_seconds) is hit.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed cliping parameter epislon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(oldpi.get_variables(),
                                        pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0, max_seconds > 0]
    ) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # Gather episode lengths/returns across all MPI workers.
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
def fit(
        env,
        q_func,
        lr=5e-4,
        max_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=0.02,
        train_freq=1,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        checkpoint_path=None,
        learning_starts=1000,
        gamma=1.0,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        param_noise=False,
        callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions)
        with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration
        rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
        set to None to disable printing
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is
        restored at the end of the training. If you do not wish to restore
        the best version at the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: True
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial
        value to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    callback: (locals, globals) -> None
        function called at every steps with state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on
        the act function.
    """
    # Create all the functions necessary to train the model
    model = DeepDQN()
    sess = model.init_session().__enter__()

    # capture the shape outside the closure so that the env object is
    # not serialized by cloudpickle when serializing make_obs_ph
    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, train, update_target, debug = model.build_train(
        make_obs_ph, q_func, env.action_space.n,
        tf.train.AdamOptimizer(learning_rate=lr), 10, gamma, param_noise)

    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': env.action_space.n,
    }

    act = ActWrapper(act, act_params)

    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * max_timesteps),
        initial_p=1.0,
        final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    model.init_vars()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.reset()
    reset = True

    with tempfile.TemporaryDirectory() as td:
        td = checkpoint_path or td

        model_file = os.path.join(td, "model")
        model_saved = False
        # Resume from an existing checkpoint if one is present.
        if tf.train.latest_checkpoint(td) is not None:
            model.load_state(model_file)
            logger.log('Loaded model from {}'.format(model_file))
            model_saved = True

        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            kwargs = {}
            if not param_noise:
                update_eps = exploration.value(t)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence
                # between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with
                # eps = exploration.value(t). See Appendix C.1 in
                # Parameter Space Noise for Exploration, Plappert et
                # al., 2017
                # for detailed explanation.
                update_param_noise_threshold = -np.log(
                    1. - exploration.value(t) +
                    exploration.value(t) / float(env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = \
                    update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            action = act(np.array(obs)[None], update_eps=update_eps,
                         **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = env.step(env_action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled
                # from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = \
                        replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1,
                                  dones, weights)
                if prioritized_replay:
                    # Priorities follow |TD error| plus a small epsilon.
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log(
                            "Saving model due to mean reward increase: {} -> {}".
                            format(saved_mean_reward, mean_100ep_reward))
                    model.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        # Restore the best-performing weights before returning.
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(
                    saved_mean_reward))
            model.load_state(model_file)

    return act
def run(args):
    """Run the NN loss-comparison experiment suite.

    Tunes hyper-parameters, then trains the same network under five losses
    (cross entropy, peer, surrogate, symmetric, DMI) over a multiprocessing
    pool, plots test/train accuracy and loss curves, and logs final mean
    accuracies.

    Args:
        args: dict-like config; must contain 'dataset' and 'loss' keys.
            A copy is used, so the caller's dict is not mutated.

    Fix: first plot's label 'symmtric loss' corrected to 'symmetric loss'
    for consistency with the other two plots in this function.
    """
    prefix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    logger.configure(f'logs/{args["dataset"]}/nn/{prefix}')
    logger.info(args)
    pool = mp.Pool(mp.cpu_count())
    nn_arg = args.copy()
    nn_arg.update(find_best_params(nn_arg))
    nn_arg.update(find_best_alpha_val(nn_arg))
    logger.record_tabular('[PEER] batchsize', nn_arg['batchsize'])
    logger.record_tabular('[PEER] learning rate', nn_arg['lr'])
    logger.record_tabular('[PEER] hidsize', nn_arg['hidsize'])
    logger.record_tabular('[PEER] alpha', nn_arg['alpha'])
    logger.dump_tabular()
    nn_arg['seed'] = 1
    # Single warm-up call outside the pool before the parallel runs --
    # presumably to initialize shared state once; confirm with author.
    run_nn_dmi(nn_arg)
    results_dmi = pool.map(run_nn_dmi, make_arg_list(nn_arg))
    results_surr = pool.map(run_nn_surr, make_arg_list(nn_arg))
    results_nn = pool.map(run_nn, make_arg_list(nn_arg))
    results_peer = pool.map(run_nn_peer, make_arg_list(nn_arg))
    results_symm = pool.map(run_nn_symm, make_arg_list(nn_arg))
    pool.close()
    pool.join()

    # Test-accuracy curves: one per-epoch accuracy list per run.
    test_acc_bce = [res['val_acc'] for res in results_nn]
    test_acc_peer = [res['val_acc'] for res in results_peer]
    test_acc_surr = [res['val_acc'] for res in results_surr]
    test_acc_symm = [res['val_acc'] for res in results_symm]
    test_acc_dmi = [res['val_acc'] for res in results_dmi]
    plot([
        test_acc_bce, test_acc_peer, test_acc_surr, test_acc_symm,
        test_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss',
        'symmetric loss', 'dmi loss'
    ],
         title='Accuracy During Testing',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    # Train-accuracy curves.
    train_acc_bce = [res['train_acc'] for res in results_nn]
    train_acc_peer = [res['train_acc'] for res in results_peer]
    train_acc_surr = [res['train_acc'] for res in results_surr]
    train_acc_symm = [res['train_acc'] for res in results_symm]
    train_acc_dmi = [res['train_acc'] for res in results_dmi]
    plot([
        train_acc_bce, train_acc_peer, train_acc_surr, train_acc_symm,
        train_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss',
        'symmetric loss', 'dmi loss'
    ],
         title='Accuracy During Training',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    # Training-loss curves.
    loss_acc_surr = [res['loss'] for res in results_surr]
    loss_acc_bce = [res['loss'] for res in results_nn]
    loss_acc_peer = [res['loss'] for res in results_peer]
    loss_acc_symm = [res['loss'] for res in results_symm]
    loss_acc_dmi = [res['loss'] for res in results_dmi]
    plot([
        loss_acc_bce, loss_acc_peer, loss_acc_surr, loss_acc_symm,
        loss_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss',
        'symmetric loss', 'dmi loss'
    ],
         title='Loss',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    # Final (last-epoch) mean test accuracy across runs, per loss.
    logger.record_tabular('[NN] with peer loss',
                          np.mean(test_acc_peer, 0)[-1])
    logger.record_tabular('[NN] with surrogate loss',
                          np.mean(test_acc_surr, 0)[-1])
    logger.record_tabular('[NN] with symmetric loss',
                          np.mean(test_acc_symm, 0)[-1])
    logger.record_tabular('[NN] with dmi loss',
                          np.mean(test_acc_dmi, 0)[-1])
    logger.record_tabular(f'[NN] with {args["loss"]} loss',
                          np.mean(test_acc_bce, 0)[-1])
    logger.dump_tabular()
def run_one_actor(index, args, unknown_args, actor_status):
    """Actor process: roll out the environment and ship batches to the learner.

    Repeatedly (a) refreshes model weights from the checkpoint path,
    (b) collects `args.max_steps_per_update` environment steps, and
    (c) sends the batch to the learner over a ZMQ REQ socket. Marks
    `actor_status[index] = 1` on completion.

    Args:
        index: actor index; only actor 0 configures full logging output.
        args: namespace with ip/data_port/env/num_envs/log_path/num_steps/
            max_steps_per_update/ckpt_path.
        unknown_args: extra kwargs forwarded to the env factory.
        actor_status: shared array used to signal actor completion.

    Fix: `dtype=np.bool` replaced with builtin `bool` -- the `np.bool`
    alias was deprecated in NumPy 1.20 and removed in 1.24 (it was always
    just an alias for Python's `bool`, so behavior is unchanged).
    """
    import tensorflow.compat.v1 as tf
    from tensorflow.keras.backend import set_session

    # Set 'allow_growth' so TF does not grab all GPU memory up front.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and model instance
    env = get_env(args.env, args.num_envs, **unknown_args)
    model = get_model(env, args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Initialize values
    model_id = -1
    episode_infos = deque(maxlen=100)
    num_episode = 0
    state = env.reset()
    nupdates = args.num_steps // args.max_steps_per_update
    model_init_flag = 0

    for update in range(1, nupdates + 1):
        # Update weights; do not act until weights were loaded at least once.
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            model.set_weights(new_weights)
            model_init_flag = 1
        elif model_init_flag == 0:
            continue

        # Collect data
        mb_states, mb_actions, mb_rewards, mb_dones, mb_extras = \
            [], [], [], [], []
        start_time = time.time()
        for _ in range(args.max_steps_per_update):
            mb_states.append(state)

            # Sample action
            action, value, neglogp = model.forward(state)
            extra_data = {'value': value, 'neglogp': neglogp}
            state, reward, done, info = env.step(action)

            mb_actions.append(action)
            mb_rewards.append(reward)
            mb_dones.append(done)
            mb_extras.append(extra_data)

            for info_i in info:
                maybeepinfo = info_i.get('episode')
                if maybeepinfo:
                    episode_infos.append(maybeepinfo)
                    num_episode += 1
        mb_states = np.asarray(mb_states, dtype=state.dtype)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
        mb_actions = np.asarray(mb_actions)
        # np.bool was removed in NumPy 1.24; builtin bool is equivalent.
        mb_dones = np.asarray(mb_dones, dtype=bool)

        # Adjust data format and send to learner
        data = prepare_training_data(
            model,
            [mb_states, mb_actions, mb_rewards, mb_dones, state, mb_extras])
        socket.send(serialize(data).to_buffer())
        socket.recv()
        send_data_interval = time.time() - start_time

        # Log information
        logger.record_tabular("steps", update * args.max_steps_per_update)
        logger.record_tabular("episodes", num_episode)
        logger.record_tabular(
            "mean 100 episode reward",
            round(np.mean([epinfo['reward'] for epinfo in episode_infos]), 2))
        logger.record_tabular(
            "mean 100 episode length",
            round(np.mean([epinfo['length'] for epinfo in episode_infos]), 2))
        logger.record_tabular("send data interval", send_data_interval)
        logger.record_tabular("send data fps",
                              args.max_steps_per_update // send_data_interval)
        logger.record_tabular("total steps",
                              nupdates * args.max_steps_per_update)
        logger.dump_tabular()

    actor_status[index] = 1
def fit(
        model,
        env,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint (exactly one must be > 0)
        callback=None):
    """TRPO-style training loop (MPI-parallel).

    Each iteration: sample a trajectory segment, compute GAE advantages,
    take a KL-constrained natural-gradient policy step found via conjugate
    gradient + backtracking line search, then fit the value function with
    a few Adam epochs. Logs diagnostics via `logger`.
    """
    # Setup losses and stuff
    # ----------------------------------------
    # nworkers = MPI.COMM_WORLD.Get_size()
    # rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Broadcast rank-0 parameters so all MPI workers start identical.
    th_init = model.get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    model.set_from_flat(th_init)
    model.vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = model.traj_segment_generator(model.pi,
                                           env,
                                           timesteps_per_batch,
                                           stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1
    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)
        with model.timed("sampling"):
            seg = seg_gen.__next__()
        model.add_vtarg_and_adv(seg, gamma, lam)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        if hasattr(model.pi, "ret_rms"):
            model.pi.ret_rms.update(tdlamret)
        if hasattr(model.pi, "ob_rms"):
            model.pi.ob_rms.update(ob)  # update running mean/std for policy
        args = seg["ob"], seg["ac"], atarg
        # Subsample every 5th row for Fisher-vector products (cheaper FVP).
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            # FVP averaged over workers, with CG damping for stability.
            return model.allmean(model.compute_fvp(
                p, *fvpargs)) + cg_damping * p

        model.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        with model.timed("computegrad"):
            *lossbefore, g = model.compute_lossandgrad(*args)
        lossbefore = model.allmean(np.array(lossbefore))
        g = model.allmean(g)
        if np.allclose(g, 0):
            # NOTE(review): if this branch is taken, `meanlosses` below is
            # stale/undefined on the first iteration -- confirm upstream.
            logger.log("Got zero gradient. not updating")
        else:
            with model.timed("cg"):
                stepdir = conjugate_gradient(fisher_vector_product,
                                             g,
                                             cg_iters=cg_iters,
                                             verbose=model.rank == 0)
            assert np.isfinite(stepdir).all()
            # Scale the CG direction so the quadratic KL estimate hits max_kl.
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = model.get_flat()
            # Backtracking line search: halve the step up to 10 times until
            # the surrogate improves without violating the KL constraint.
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                model.set_from_flat(thnew)
                meanlosses = surr, kl, *_ = model.allmean(
                    np.array(model.compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                model.set_from_flat(thbefore)
            if model.nworkers > 1 and iters_so_far % 20 == 0:
                # Periodic sanity check that all workers stayed in sync.
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), model.vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
        for (lossname, lossval) in zip(model.loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        with model.timed("vf"):
            # Value-function regression: vf_iters Adam epochs on minibatches.
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = model.allmean(model.compute_vflossandgrad(mbob, mbret))
                    model.vfadam.update(g, vf_stepsize)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(model.flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if model.rank == 0:
            logger.dump_tabular()
def fit(policy,
        env,
        seed,
        total_timesteps=int(40e6),
        gamma=0.99,
        log_interval=1,
        nprocs=32,
        nsteps=20,
        ent_coef=0.01,
        vf_coef=0.5,
        vf_fisher_coef=1.0,
        lr=0.25,
        max_grad_norm=0.5,
        kfac_clip=0.001,
        save_interval=None,
        lrschedule='linear'):
    """ACKTR-style training loop for a discrete-action policy.

    Builds an `AcktrDiscrete` model, runs the K-FAC queue-runner threads,
    and alternates rollout collection with training updates, logging
    diagnostics every `log_interval` updates and optionally checkpointing.
    """
    tf.reset_default_graph()
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    # NOTE(review): `vf_coef=vf_fisher_coef` passes the Fisher coefficient
    # where the value-loss coefficient is expected, leaving the `vf_coef`
    # parameter unused -- looks like a slip; confirm against
    # AcktrDiscrete's signature before changing.
    model = AcktrDiscrete(policy,
                          ob_space,
                          ac_space,
                          nenvs,
                          total_timesteps,
                          nsteps=nsteps,
                          ent_coef=ent_coef,
                          vf_coef=vf_fisher_coef,
                          lr=lr,
                          max_grad_norm=max_grad_norm,
                          kfac_clip=kfac_clip,
                          lrschedule=lrschedule)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     with open(os.path.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
    #         fh.write(cloudpickle.dumps(make_model))
    # model = make_model()
    runner = Environment(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs * nsteps
    tstart = time.time()
    # K-FAC uses asynchronous ops; start its queue-runner threads.
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess,
                                                    coord=coord,
                                                    start=True)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
        if save_interval and (update % save_interval == 0 or update == 1) \
                and logger.get_dir():
            savepath = os.path.join(logger.get_dir(),
                                    'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    # Shut down the queue-runner threads cleanly before closing the env.
    coord.request_stop()
    coord.join(enqueue_threads)
    env.close()
def _train(self, env, policy, initial_exploration_policy, pool):
    """Perform RL training.

    Args:
        env (`rllab.Env`): Environment used for training
        policy (`Policy`): Policy used for training
        initial_exploration_policy ('Policy'): Policy used for exploration
            If None, then all exploration is done using policy
        pool (`PoolBase`): Sample pool to add samples to
    """
    self._init_training(env, policy, pool)
    if initial_exploration_policy is None:
        self.sampler.initialize(env, policy, pool)
        initial_exploration_done = True
    else:
        self.sampler.initialize(env, initial_exploration_policy, pool)
        initial_exploration_done = False
    with self._sess.as_default():
        # gtimer bookkeeping: named root, fresh timers, non-unique stamps.
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1), save_itrs=True):
            # logger.push_prefix()
            logger.log('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                # TODO.codeconsolidation: Add control interval to sampler
                # Switch from the exploration policy to the learned policy
                # once enough initial exploration steps have elapsed.
                if not initial_exploration_done:
                    if self._epoch_length * epoch >= self._n_initial_exploration_steps:
                        self.sampler.set_policy(policy)
                        initial_exploration_done = True
                self.sampler.sample()
                # Skip training until the pool has enough samples for a batch.
                if not self.sampler.batch_ready():
                    continue
                gt.stamp('sample')

                for i in range(self._n_train_repeat):
                    self._do_training(iteration=t +
                                      epoch * self._epoch_length,
                                      batch=self.sampler.random_batch())
                gt.stamp('train')

            self._evaluate(epoch)
            params = self.get_snapshot(epoch)
            # logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            # 'eval' stamp is only available after the first full epoch.
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self.sampler.log_diagnostics()

            # logger.dump_tabular(with_prefix=False)
            logger.dump_tabular()
            # logger.pop_prefix()

            gt.stamp('eval')

        self.sampler.terminate()
def rollouts(self):
    """PPO-style optimization loop.

    Repeatedly samples trajectory segments, computes GAE advantages,
    runs several epochs of clipped-surrogate minibatch SGD via MpiAdam,
    and logs diagnostics, until exactly one configured time constraint
    (max_timesteps / max_episodes / max_iters / max_seconds) is hit.
    Returns the trained policy `self.pi`.
    """
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = self.traj_segment_generator(self.pi,
                                          self.env,
                                          self.timesteps_per_actorbatch,
                                          stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    assert sum([
        self.max_iters > 0, self.max_timesteps > 0, self.max_episodes > 0,
        self.max_seconds > 0
    ]) == 1, "Only one time constraint permitted"
    while True:
        if self.callback:
            self.callback(locals(), globals())
        if self.max_timesteps and timesteps_so_far >= self.max_timesteps:
            break
        elif self.max_episodes and episodes_so_far >= self.max_episodes:
            break
        elif self.max_iters and iters_so_far >= self.max_iters:
            break
        elif self.max_seconds and time.time() - tstart >= self.max_seconds:
            break
        # Anneal the learning-rate multiplier according to the schedule.
        if self.schedule == 'constant':
            cur_lrmult = 1.0
        elif self.schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / self.max_timesteps, 0)
        else:
            raise NotImplementedError
        logger.log("********** Iteration %i ************" % iters_so_far)
        seg = seg_gen.__next__()
        self.add_vtarg_and_adv(seg, self.gamma, self.lam)
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], \
            seg["tdlamret"]
        vpredbefore = seg[
            "vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std(
        )  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not self.pi.recurrent)
        optim_batchsize = self.optim_batchsize or ob.shape[0]
        if hasattr(self.pi, "ob_rms"):
            self.pi.ob_rms.update(ob)  # update running mean/std for policy
        self.assign_old_eq_new(
        )  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, self.loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(self.optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = self.lossandgrad(batch["ob"], batch["ac"],
                                                 batch["atarg"],
                                                 batch["vtarg"], cur_lrmult)
                self.adam.update(g, self.optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))
        # One extra pass to evaluate (not optimize) the final losses.
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = self.loss(batch["ob"], batch["ac"], batch["atarg"],
                                  batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, self.loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # Gather episode stats from all MPI workers before logging.
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(self.flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
    return self.pi
def main(_):
    """Hierarchical-DQN training entry point.

    A meta-controller picks goals; a controller picks primitive actions to
    reach the current goal and is rewarded intrinsically by a critic. Both
    are DQNs with target networks and replay buffers (D1 for the
    controller, D2 for the meta-controller). Runs for MAX_EPISODE episodes,
    periodically logging and checkpointing on mean-reward improvement.
    """
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])
    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)
    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)
    # get default tf_session
    sess = U.get_session()
    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space,
                                   env.action_space.n,
                                   controller_optimizer,
                                   scope='controller')
    controller = Controller(controller_network, env.action_space.n)
    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(
        learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space,
                                       num_goals,
                                       metacontroller_optimizer,
                                       scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(
        schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
        initial_p=1.0,
        final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    # initialize critic (source of intrinsic rewards)
    critic = Critic(env.unwrapped)
    total_extrinsic_reward = []
    # per-goal counters, used for success rate / adaptive epsilon
    total_goal_reached = np.zeros(num_goals, dtype=np.int32)
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()
    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)
    # load ckpt if presence
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:
        # count number of steps
        # init environment game play variables
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'],
                                 (1, ) + init_ob['observation'].shape)
        # First goal of the episode is sampled fully greedily-random
        # (update_eps=1.0).
        desired_goal = metacontroller.sample_act(sess,
                                                 observation,
                                                 update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1
        # given predicted goal, we encode this goal bounding mask to the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'],
                                                 desired_goal)
        # NOTE: Below code verify added mask correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)
        done = False
        reached_goal = False
        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']
            # Inner loop: controller acts until the goal is reached or the
            # episode ends.
            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(
                    total_goal_epsilon, total_goal_reached,
                    total_goal_sampled, desired_goal, total_step,
                    EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g,
                                                (1, ) + ob_with_g.shape)
                primitive_action_t = controller.sample_act(
                    sess,
                    ob_with_g_reshaped,
                    update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(
                    primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(
                    ob_tp1['observation'], desired_goal)
                intrinsic_reward_t = critic.criticize(desired_goal,
                                                      reached_goal,
                                                      primitive_action_t,
                                                      done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t,
                                             intrinsic_reward_t,
                                             ob_with_g_tp1, done_t)
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, \
                    obs_with_g_tp1, dones_t = controller_replay_buffer.sample(
                        TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                # NOTE(review): reshaping a sampled BATCH to (1,)+single-obs
                # shape looks suspect -- verify batch dimensions here.
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1,
                                                    (1, ) + ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t,
                                            primitive_actions_t,
                                            intrinsic_rewards_t,
                                            obs_with_g_tp1, dones_t, weights,
                                            q_tp1)
                # join train meta-controller only sample from replay_buffer2 to train meta-controller
                if total_step >= WARMUP_STEPS:
                    # NOTE(review): total_step is passed as a separate arg, so
                    # the literal '%d' is not interpolated by L.log here.
                    L.log('join train has started ----- step %d', total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, \
                        obs_terminate_in_g, dones_t = \
                        metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(
                        extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(
                        obs_terminate_in_g,
                        (1, ) + obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(
                        sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t,
                                                    extrinsic_rewards_t,
                                                    obs_terminate_in_g,
                                                    dones_t, weights, q_tp1)
                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # its fine, we aren't really training meta dqn until after certain steps.
                    sess.run(metacontroller.network.update_target_op)
                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'],
                                             desired_goal, extrinsic_rewards,
                                             ob_tp1['observation'], done)
            # if we are here then we have finished the desired goal
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g,
                                                (1, ) + ob_with_g.shape)
                # Resample until the meta-controller proposes a goal that
                # differs from the one just achieved.
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(
                        sess, ob_with_g_reshaped,
                        update_eps=exploration_ep)[0]
                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' %
                      (ep, env.unwrapped.achieved_goal, desired_goal))
                # start again
                reached_goal = False
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1
        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0:
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()
        if total_step % monitor.ckpt_freq == 0:
            # Checkpoint only when the rolling mean reward improved.
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".
                      format(saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    # verified our model was saved
    if model_saved:
        L.log('restored model with mean reward: %d' % saved_mean_reward)
        U.load_variables(model_file)
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):  # keyword-only; carries kl_threshold, sepg_lam, num_timesteps,
                # agg_kl_threshold, optim_epochs
    """Policy-gradient training with a KL-thresholded surrogate loss.

    Builds new/old policy networks and a surrogate objective that zeroes
    out samples whose per-sample KL exceeds `args.kl_threshold`, then runs
    the usual rollout / GAE / minibatch-Adam loop, stopping each update's
    epochs early once aggregate KL exceeds `args.agg_kl_threshold`.
    Returns `running_scores`: a list of (timesteps, mean reward) pairs.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent
    # Importance ratio pi/oldpi via exp of log-prob difference.
    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))
    ratio = newprob / oldprob
    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)
    # Mask out samples whose per-sample KL already exceeds the threshold.
    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)
    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold
    pol_surr = tf.reduce_mean(pol_surr)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)
    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    # NOTE(review): lenbuffer is created but never extended below (only
    # rewbuffer is) -- episode lengths are not tracked in this variant.
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    running_scores = []
    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"
    while True:
        if callback:
            callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate
        optim_batchsize = optim_batchsize or ob.shape[0]
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        # Here we do a bunch of optimization epochs over the data,
        # stopping early once aggregate KL crosses the threshold.
        for num_epoch in count():
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                # Guard against NaN/inf gradients before the Adam step.
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            agg_mean_kl = get_mean_kl(ob, ac)
            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        rewbuffer.extend(rews)
        mean_score = None
        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)
            logger.dump_tabular()
    return running_scores
def fit(policy,
        env,
        seed,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100):
    """A2C training loop.

    Builds the A2C model and runner, then alternates rollout collection
    with gradient updates for `total_timesteps // nbatch` updates, logging
    diagnostics every `log_interval` updates.

    Fixes vs. original:
    - Removed `tf.reset_default_graph()` from inside the training loop:
      the graph, session and FileWriter are built before the loop, and
      resetting the default graph mid-training detaches them.
    - `policy_loss` was computed but never logged; it is now recorded,
      matching the diagnostics logged by the sibling ACKTR `fit`.
    """
    set_global_seeds(seed)
    model = A2C(policy=policy,
                observation_space=env.observation_space,
                action_space=env.action_space,
                nenvs=env.num_envs,
                nsteps=nsteps,
                ent_coef=ent_coef,
                vf_coef=vf_coef,
                max_grad_norm=max_grad_norm,
                lr=lr,
                alpha=alpha,
                epsilon=epsilon,
                total_timesteps=total_timesteps,
                lrschedule=lrschedule)
    session = model.init_session()
    tf.global_variables_initializer().run(session=session)
    env_runner = Environment(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = env.num_envs * nsteps
    tstart = time.time()
    writer = tf.summary.FileWriter('output', session.graph)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = env_runner.run(session)
        policy_loss, value_loss, policy_entropy = model.predict(
            observations=obs,
            states=states,
            rewards=rewards,
            masks=masks,
            actions=actions,
            values=values,
            session=session)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()
    env.close()
    writer.close()
    session.close()
def run_one_agent(index, args, unknown_args, actor_status):
    """Actor worker: roll out episodes and stream training data to a learner.

    Runs ``args.num_steps`` environment steps, batching transitions into a
    local memory pool and shipping them to the learner over a ZeroMQ REQ
    socket every ``args.max_steps_per_update`` steps. Periodically reloads
    fresh model weights from ``args.ckpt_path``.

    Args:
        index: worker index; only worker 0 writes human-readable logs.
        args: parsed experiment arguments (ip, data_port, log_path, ...).
        unknown_args: extra CLI arguments forwarded to ``init_components``.
        actor_status: shared array; slot ``index`` is set to 1 on completion.
    """
    from tensorflow.keras.backend import set_session
    import tensorflow.compat.v1 as tf

    # Set 'allow_growth' so TF does not grab all GPU memory up front.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and agent instance
    env, agent = init_components(args, unknown_args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
        save_yaml_config(args.exp_path / 'config.yaml', args, 'actor', agent)
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Create local queues for collecting data
    transitions = []  # A list to store raw transitions within an episode
    mem_pool = MemPool()  # A pool to store prepared training data

    # Initialize values
    model_id = -1
    episode_rewards = [0.0]
    episode_lengths = [0]
    num_episodes = 0
    mean_10ep_reward = 0
    mean_10ep_length = 0
    send_time_start = time.time()

    state = env.reset()
    for step in range(args.num_steps):
        # Do some updates
        agent.update_sampling(step, args.num_steps)

        # Sample action
        action, extra_data = agent.sample(state)
        next_state, reward, done, info = env.step(action)

        # Record current transition
        transitions.append(
            (state, action, reward, next_state, done, extra_data))
        episode_rewards[-1] += reward
        episode_lengths[-1] += 1
        state = next_state

        # Episode also terminates when max_episode_length (> 0) is hit.
        is_terminal = done or episode_lengths[-1] >= args.max_episode_length > 0
        if is_terminal or len(mem_pool) + len(
                transitions) >= args.max_steps_per_update:
            # Current episode is terminated or a trajectory of enough
            # training data is collected
            data = agent.prepare_training_data(transitions)
            transitions.clear()
            mem_pool.push(data)

            if is_terminal:
                # Log information at the end of episode
                num_episodes = len(episode_rewards)
                mean_10ep_reward = round(np.mean(episode_rewards[-10:]), 2)
                mean_10ep_length = round(np.mean(episode_lengths[-10:]), 2)
                episode_rewards.append(0.0)
                episode_lengths.append(0)

                # Reset environment
                state = env.reset()

        if len(mem_pool) >= args.max_steps_per_update:
            # Send training data after enough training data
            # (>= 'arg.max_steps_per_update') is collected
            post_processed_data = agent.post_process_training_data(
                mem_pool.sample())
            socket.send(serialize(post_processed_data).to_buffer())
            socket.recv()  # REQ/REP: wait for the learner's acknowledgement
            mem_pool.clear()

            send_data_interval = time.time() - send_time_start
            send_time_start = time.time()

            if num_episodes > 0:
                # Log information
                logger.record_tabular(
                    "iteration", (step + 1) // args.max_steps_per_update)
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 10 episode reward",
                                      mean_10ep_reward)
                logger.record_tabular("mean 10 episode length",
                                      mean_10ep_length)
                logger.record_tabular(
                    "send data fps",
                    args.max_steps_per_update // send_data_interval)
                logger.record_tabular("send data interval",
                                      send_data_interval)
                logger.dump_tabular()

        # Update weights when a newer checkpoint has appeared.
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            agent.set_weights(new_weights)

    actor_status[index] = 1
def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """
    Main DDPG training loop.

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network update rate
        noise_std: std of the exploration noise added to actions
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches before each test
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    # add 1 to obs dimension for time step feature (see run_episode())
    obs_dim += 1
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim,
                  float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim,
                         float(env.action_space.high[0])).to(device)
    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent and sync target networks with the online ones
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std,
                 gamma, tau)
    agent.align_target()

    # create replay_buffer
    replay_buffer = ReplayBuffer()

    # run a few episodes of untrained policy to initialize scaler and fill
    # in replay buffer
    run_policy(env, agent, replay_buffer, mode="random",
               episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    # NOTE(fix): loop variable renamed from `iter` (shadowed the builtin).
    for iteration in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env, agent,
                                                    replay_buffer,
                                                    mode="train",
                                                    episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # train: roughly one update per `batch_size` collected steps
            num_epoch = total_steps // batch_size
            for e in range(num_epoch):
                observation, action, reward, next_obs, done = \
                    replay_buffer.sample()
                agent.update(observation, action, reward, next_obs, done)

        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env, agent, replay_buffer, mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iteration)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
def fit(env, policy, vf, gamma, lam, timesteps_per_batch, num_timesteps,
        animate=False, callback=None, desired_kl=0.002):
    """Train a policy with ACKTR (K-FAC natural-gradient policy updates).

    Collects batches of rollouts, fits the value function, applies a K-FAC
    policy update on standardized advantages, and adapts the step size to
    keep the per-batch KL divergence near ``desired_kl``.

    Args:
        env: Gym environment (uses ``env.spec.timestep_limit``).
        policy: policy object exposing ``update_info``, ``wd_dict`` and
            ``compute_kl``.
        vf: value-function object exposing ``predict``, ``fit`` and
            ``q_runner``.
        gamma: reward discount factor.
        lam: GAE lambda for advantage estimation.
        timesteps_per_batch: minimum timesteps collected per update.
        num_timesteps: total timesteps to train for.
        animate: render the first rollout every 10th iteration.
        callback: optional callable invoked once per iteration.
        desired_kl: target KL divergence used to adapt the step size.
    """
    obfilter = ZFilter(env.observation_space.shape)
    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    # NOTE(fix): `async` became a reserved keyword in Python 3.7, so the
    # original `async=1` keyword argument is a SyntaxError there; pass it
    # via dict unpacking instead.
    optim = KfacOptimizer(learning_rate=stepsize,
                          cold_lr=stepsize * (1 - 0.9),
                          momentum=0.9,
                          kfac_update=2,
                          epsilon=1e-2,
                          stats_decay=0.99,
                          cold_iter=1,
                          weight_decay_dict=policy.wd_dict,
                          max_grad_norm=None,
                          **{'async': 1})

    # Only optimize the policy ("pi") variables with K-FAC.
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss, loss_sampled,
                                         var_list=pi_var_list)
    do_update = Model().function(inputs, update_op)
    Model().init_vars()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr is not None)
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(), coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env, policy, max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function (GAE over bootstrapped value preds)
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            # Bootstrap with the last value unless the episode terminated.
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = discount(delta_t, gamma * lam)
            advs.append(adv_t)

        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize to keep KL near desired_kl
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize,
                      tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize,
                      tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([path["reward"].sum() / np.sqrt(len(paths))
                    for path in paths]))
        logger.record_tabular(
            "EpLenMean", np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)