def _setup_critic_optimizer(self):
    """
    setup the optimizer for the critic
    """
    if self.verbose >= 2:
        logger.info('setting up critic optimizer')
    normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                   self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [var for var in tf_util.get_trainable_vars('model/qf/')
                           if 'bias' not in var.name and 'output' not in var.name and 'b' not in var.name]
        if self.verbose >= 2:
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars
        )
        self.critic_loss += critic_reg
    critic_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/qf/')]
    critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
    if self.verbose >= 2:
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
    self.critic_grads = tf_util.flatgrad(self.critic_loss, tf_util.get_trainable_vars('model/qf/'),
                                         clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/qf/'),
                                    beta1=0.9, beta2=0.999, epsilon=1e-08)
def apply_policy(ph_ob, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps, pdparamsize):
    data_format = 'NHWC'
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info("CnnPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32) / 255.
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())
    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
        X = to2d(X)
        mix_other_observations = [X]
        X = tf.concat(mix_other_observations, axis=1)
        X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
        additional_size = 448
        X = activ(fc(X, 'fc_additional', nh=additional_size, init_scale=np.sqrt(2)))
        snext = tf.zeros((sy_nenvs, memsize))
        mix_timeout = [X]
        Xtout = tf.concat(mix_timeout, axis=1)
        if extrahid:
            Xtout = X + activ(fc(Xtout, 'fc2val', nh=additional_size, init_scale=0.1))
            X = X + activ(fc(X, 'fc2act', nh=additional_size, init_scale=0.1))
        pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
def load_model(env, name=None):
    if name:
        filename = os.path.join(config.MODELDIR, env.name, name)
        if os.path.exists(filename):
            logger.info(f'Loading {name}')
            cont = True
            while cont:
                try:
                    ppo_model = PPO1.load(filename, env=env)
                    cont = False
                except Exception as e:
                    time.sleep(5)
                    print(e)
        else:
            raise Exception(f'\n{filename} not found')
    else:
        logger.info('Loading base PPO model')
        cont = True
        while cont:
            try:
                ppo_model = PPO1(get_network_arch(env.name), env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)
    return ppo_model
def verify(self, n=2000, eps=1e-4):
    buffer = OffPolicyBuffer(n, self.observation_space.shape, 1, self.action_space)
    state = self.reset()
    for _ in range(n):
        action = self.action_space.sample()
        next_state, reward, done, _ = self.step(action)
        mask = torch.tensor([0.0] if done else [1.0], dtype=torch.float32)
        buffer.insert(torch.tensor(state), torch.tensor(action), torch.tensor(reward),
                      torch.tensor(next_state), torch.tensor(mask))
        state = next_state
        if done:
            state = self.reset()
    rewards_, dones_ = self.mb_step(buffer.states.numpy(), buffer.actions.numpy(), buffer.next_states.numpy())
    diff = (buffer.rewards.numpy() - rewards_[:, np.newaxis]) * buffer.masks.numpy()
    l_inf = np.abs(diff).max()
    logger.info('reward difference: %.6f', l_inf)
    # note: the error messages below were swapped in the original; dones_ checks the done model,
    # l_inf checks the reward model
    assert np.allclose(dones_, buffer.masks), 'done model is inaccurate'
    assert l_inf < eps, 'reward model is inaccurate'
def get_perturbed_actor_updates(actor, perturbed_actor, param_noise_stddev, verbose=0):
    """
    get the actor update, with noise.

    :param actor: (str) the actor
    :param perturbed_actor: (str) the perturbed actor
    :param param_noise_stddev: (float) the std of the parameter noise
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation) the update function
    """
    # TODO: simplify this to this:
    # assert len(actor.vars) == len(perturbed_actor.vars)
    # assert len(actor.perturbable_vars) == len(perturbed_actor.perturbable_vars)

    assert len(tf_util.get_globals_vars(actor)) == len(tf_util.get_globals_vars(perturbed_actor))
    assert len([var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]) == \
        len([var for var in tf_util.get_trainable_vars(perturbed_actor) if 'LayerNorm' not in var.name])

    updates = []
    for var, perturbed_var in zip(tf_util.get_globals_vars(actor), tf_util.get_globals_vars(perturbed_actor)):
        if var in [var for var in tf_util.get_trainable_vars(actor) if 'LayerNorm' not in var.name]:
            if verbose >= 2:
                logger.info('  {} <- {} + noise'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var,
                                     var + tf.random_normal(tf.shape(var), mean=0.,
                                                            stddev=param_noise_stddev)))
        else:
            if verbose >= 2:
                logger.info('  {} <- {}'.format(perturbed_var.name, var.name))
            updates.append(tf.assign(perturbed_var, var))
    assert len(updates) == len(tf_util.get_globals_vars(actor))
    return tf.group(*updates)
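# --- Hedged usage sketch (not part of the original source) ---
# Illustrates how get_perturbed_actor_updates() above could be wired into a DDPG-style setup.
# The scope names 'model/pi/' and 'noise/pi/', the stddev value and the `sess` handle are
# assumptions made only for this sketch, so the calls are left commented out.
# perturb_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/', param_noise_stddev=0.2, verbose=2)
# sess.run(perturb_ops)  # refresh the perturbed actor copy before collecting the next rollout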
def __enter__(self):
    if self.tensorboard_log_path is not None:
        save_path = os.path.join(self.tensorboard_log_path,
                                 "{}_{}".format(self.tb_log_name, self._get_latest_run_id() + 1))
        self.writer = tf.summary.FileWriter(save_path, graph=self.graph)
        logger.info('TF Logging to {} ...'.format(save_path))
    return self.writer
def _setup_critic_optimizer(self):
    """
    setup the optimizer for the critic
    """
    if self.verbose >= 2:
        logger.info('setting up critic optimizer')

    ### BSS LOSS ###
    all_vars = [v for v in tf.global_variables()]
    self.l2_loss = 0.0
    for var in all_vars:
        if 'qf' in var.name:
            self.l2_loss += tf.losses.mean_squared_error(tf.zeros(var.shape), var)
    _, qf_features = self.policy_tf.feature_matrices()
    singular_qf = tf.linalg.svd(qf_features, compute_uv=False)
    self.bss_loss = tf.reduce_sum(tf.square(singular_qf[-1]))
    ### BSS LOSS ###

    normalized_critic_target_tf = tf.clip_by_value(
        normalize(self.critic_target, self.ret_rms),
        self.return_range[0], self.return_range[1])
    self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf)) + \
        self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
    if self.critic_l2_reg > 0.:
        critic_reg_vars = [
            var for var in tf_util.get_trainable_vars('model/qf/')
            if 'bias' not in var.name and 'qf_output' not in var.name and 'b' not in var.name
        ]
        if self.verbose >= 2:
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
        critic_reg = tc.layers.apply_regularization(
            tc.layers.l2_regularizer(self.critic_l2_reg),
            weights_list=critic_reg_vars)
        self.critic_loss += critic_reg
    critic_shapes = [
        var.get_shape().as_list()
        for var in tf_util.get_trainable_vars('model/qf/')
    ]
    critic_nb_params = sum(
        [reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
    if self.verbose >= 2:
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
    self.critic_grads = tf_util.flatgrad(
        self.critic_loss,
        tf_util.get_trainable_vars('model/qf/'),
        clip_norm=self.clip_norm)
    self.critic_optimizer = MpiAdam(
        var_list=tf_util.get_trainable_vars('model/qf/'),
        beta1=0.9, beta2=0.999, epsilon=1e-08)
def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
    # Dynamics loss with random features.

    # Random target network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xr = ph[:, 1:]
            xr = tf.cast(xr, tf.float32)
            xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)

            xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
            rgbr = [to2d(xr)]
            X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

    # Predictor network.
    ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
    assert ac_one_hot.get_shape().ndims == 3
    assert ac_one_hot.get_shape().as_list() == [None, None, self.ac_space.n], ac_one_hot.get_shape().as_list()
    ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

    def cond(x):
        return tf.concat([x, ac_one_hot], 1)

    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xrp = ph[:, :-1]
            xrp = tf.cast(xrp, tf.float32)
            xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
            # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels.
            # Is this ok?
            xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)

            xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)

            # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(fc(cond(rgbrp), 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(fc(cond(X_r_hat), 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = fc(cond(X_r_hat), 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

    self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
    self.max_feat = tf.reduce_max(tf.abs(X_r))
    self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True)
    self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

    noisy_targets = tf.stop_gradient(X_r)
    # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat))
    self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
    mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)
    mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
    self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)
def __init__(self, nenvs, nlumps):
    self.nenvs = nenvs
    self.nlumps = nlumps
    self.nenvs_per_lump = nenvs // nlumps
    self.acs = [[] for _ in range(nenvs)]
    self.int_rews = [[] for _ in range(nenvs)]
    self.ext_rews = [[] for _ in range(nenvs)]
    self.ep_infos = [{} for _ in range(nenvs)]
    self.filenames = [self.get_filename(i) for i in range(nenvs)]
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.info("episode recordings saved to ", self.filenames[0])
def step_wait(self):
    obs, rews, dones, infos = self.venv.step_wait()

    self.step_id += 1
    if self.recording:
        self.video_recorder.capture_frame()
        self.recorded_frames += 1
        if self.recorded_frames > self.video_length:
            logger.info("Saving video to ", self.video_recorder.path)
            self.close_video_recorder()
    elif self._video_enabled():
        self.start_video_recorder()

    return obs, rews, dones, infos
def display_var_info(vars):
    from stable_baselines import logger
    count_params = 0
    for v in vars:
        name = v.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(v.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue  # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(v.shape)))

    logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6))
def display_var_info(_vars):
    """
    log variable information, for debug purposes

    :param _vars: ([TensorFlow Tensor]) the variables
    """
    count_params = 0
    for _var in _vars:
        name = _var.name
        if "/Adam" in name or "beta1_power" in name or "beta2_power" in name:
            continue
        v_params = np.prod(_var.shape.as_list())
        count_params += v_params
        if "/b:" in name or "/biases" in name:
            continue  # Wx+b, bias is not interesting to look at => count params, but not print
        logger.info(" %s%s %i params %s" % (name, " " * (55 - len(name)), v_params, str(_var.shape)))

    logger.info("Total model parameters: %0.2f million" % (count_params * 1e-6))
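# --- Hedged usage sketch (not part of the original source) ---
# Minimal example call for display_var_info() above. It assumes a TF1-style default graph and simply
# builds one throwaway dense layer ('demo_dense' is a made-up name) so there is something to print.
if __name__ == '__main__':
    import tensorflow as tf

    demo_inputs = tf.placeholder(tf.float32, shape=(None, 4))
    tf.layers.dense(demo_inputs, units=8, name='demo_dense')
    display_var_info(tf.trainable_variables())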
def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
    # RND.

    # Random target network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xr = ph[:, 1:]
            xr = tf.cast(xr, tf.float32)
            xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0, 5.0)

            xr = tf.nn.leaky_relu(conv(xr, 'c1r', nf=convfeat * 1, rf=8, stride=4, init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c2r', nf=convfeat * 2 * 1, rf=4, stride=2, init_scale=np.sqrt(2)))
            xr = tf.nn.leaky_relu(conv(xr, 'c3r', nf=convfeat * 2 * 1, rf=3, stride=1, init_scale=np.sqrt(2)))
            rgbr = [to2d(xr)]
            X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

    # Predictor network.
    for ph in self.ph_ob.values():
        if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
            logger.info("CnnTarget: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
            xrp = ph[:, 1:]
            xrp = tf.cast(xrp, tf.float32)
            xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :, -1:]
            xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std, -5.0, 5.0)

            xrp = tf.nn.leaky_relu(conv(xrp, 'c1rp_pred', nf=convfeat, rf=8, stride=4, init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c2rp_pred', nf=convfeat * 2, rf=4, stride=2, init_scale=np.sqrt(2)))
            xrp = tf.nn.leaky_relu(conv(xrp, 'c3rp_pred', nf=convfeat * 2, rf=3, stride=1, init_scale=np.sqrt(2)))
            rgbrp = to2d(xrp)

            X_r_hat = tf.nn.relu(fc(rgbrp, 'fc1r_hat1_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = tf.nn.relu(fc(X_r_hat, 'fc1r_hat2_pred', nh=256 * enlargement, init_scale=np.sqrt(2)))
            X_r_hat = fc(X_r_hat, 'fc1r_hat3_pred', nh=rep_size, init_scale=np.sqrt(2))

    self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
    self.max_feat = tf.reduce_max(tf.abs(X_r))
    self.int_rew = tf.reduce_mean(tf.square(tf.stop_gradient(X_r) - X_r_hat), axis=-1, keep_dims=True)
    self.int_rew = tf.reshape(self.int_rew, (self.sy_nenvs, self.sy_nsteps - 1))

    noisy_targets = tf.stop_gradient(X_r)
    self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
    mask = tf.random_uniform(shape=tf.shape(self.aux_loss), minval=0., maxval=1., dtype=tf.float32)
    mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update, tf.float32)
    self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(tf.reduce_sum(mask), 1.)
def make_envs(env_id, do_eval, seed, conf, normalize_observations=False, normalize_returns=False):
    # Create envs.
    env_params = conf.pop('env_params', {})
    env = base_env = gym.make(env_id)
    if hasattr(base_env, 'env'):
        base_env = base_env.env
    for attr in env_params:
        setattr(base_env, attr, env_params[attr])
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)

    # Seed everything to make things reproducible.
    logger.info('seed={}, logdir={}'.format(seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)

    if normalize_observations or normalize_returns:
        env = DummyVecEnv([lambda: env])
        env = VecNormalize(env, norm_obs=normalize_observations, norm_reward=normalize_returns)

    if do_eval:
        eval_env = base_eval_env = gym.make(env_id)
        if hasattr(base_eval_env, 'env'):
            base_eval_env = base_eval_env.env
        for attr in env_params:
            setattr(base_eval_env, attr, env_params[attr])
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'), allow_early_resets=True)
        eval_env.seed(seed)
        eval_env.base_env = base_eval_env
    else:
        base_eval_env = None
        eval_env = None

    env.base_env = base_env
    return base_env, env, base_eval_env, eval_env
def load_model(env, name):
    filename = os.path.join(config.MODELDIR, env.name, name)
    if os.path.exists(filename):
        logger.info(f'Loading {name}')
        cont = True
        while cont:
            try:
                ppo_model = PPO1.load(filename, env=env)
                cont = False
            except Exception as e:
                time.sleep(5)
                print(e)
    elif name == 'base.zip':
        cont = True
        while cont:
            try:
                rank = MPI.COMM_WORLD.Get_rank()
                if rank == 0:
                    ppo_model = PPO1(get_network_arch(env.name), env=env)
                    logger.info('Saving base.zip PPO model...')
                    ppo_model.save(os.path.join(config.MODELDIR, env.name, 'base.zip'))
                else:
                    ppo_model = PPO1.load(os.path.join(config.MODELDIR, env.name, 'base.zip'), env=env)
                cont = False
            except IOError as e:
                sys.exit(f'Permissions not granted on zoo/{env.name}/...')
            except Exception as e:
                print('Waiting for base.zip to be created...', e)
                time.sleep(2)
    else:
        raise Exception(f'\n{filename} not found')

    return ppo_model
def main():
    """
    Runs the test
    """
    parser = atari_arg_parser()
    parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn',
                        help='Policy architecture')
    parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant',
                        help='Learning rate schedule')
    parser.add_argument('--sil-update', type=int, default=4, help="Number of updates per iteration")
    parser.add_argument('--sil-beta', type=float, default=0.1, help="Beta for weighted IS")
    parser.add_argument('--tensorboard-log', type=str, default='./sf_log/recons2')
    parser.add_argument('--tb', type=str, default='SIL_A2C')
    parser.add_argument('--use-sf', action='store_true')
    parser.add_argument('--use-recons', action='store_true')
    args = parser.parse_args()

    logger.configure(folder="{}/{}".format(args.tensorboard_log, args.tb))
    logger.info('use SF {}'.format(args.use_sf))
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy,
          lr_schedule=args.lr_schedule, num_env=16, sil_update=args.sil_update, sil_beta=args.sil_beta,
          use_sf=args.use_sf, use_recons=args.use_recons,
          tensorboard_log=args.tensorboard_log, tb_log_name=args.tb)
def get_target_updates(_vars, target_vars, tau, verbose=0):
    """Get target update operations.

    Parameters
    ----------
    _vars : list of tf.Tensor
        the initial variables
    target_vars : list of tf.Tensor
        the target variables
    tau : float
        the soft update coefficient (keep old values, between 0 and 1)
    verbose : int
        the verbosity level: 0 none, 1 training information, 2 tensorflow debug

    Returns
    -------
    tf.Operation
        initial update
    tf.Operation
        soft update
    """
    if verbose >= 2:
        logger.info('setting up target updates ...')
    soft_updates = []
    init_updates = []
    assert len(_vars) == len(target_vars)
    for var, target_var in zip(_vars, target_vars):
        if verbose >= 2:
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(
            tf.assign(target_var, (1. - tau) * target_var + tau * var))
    assert len(init_updates) == len(_vars)
    assert len(soft_updates) == len(_vars)
    return tf.group(*init_updates), tf.group(*soft_updates)
def main():
    """
    Runs the test
    """
    parser = atari_arg_parser()
    parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn',
                        help='Policy architecture')
    parser.add_argument('--peer', type=float, default=0.,
                        help='Coefficient of the peer term. (default: 0)')
    parser.add_argument('--note', type=str, default='test', help='Log path')
    parser.add_argument('--individual', action='store_true', default=False,
                        help='If true, no co-training is applied.')
    parser.add_argument('--start-episode', type=int, default=0,
                        help='Add peer term after this episode.')
    parser.add_argument('--end-episode', type=int, default=10000,
                        help='Remove peer term after this episode.')
    parser.add_argument('--decay-type', type=str, default=None,
                        choices=[None, 'inc', 'dec', 'inc_dec'],
                        help='Decay type for alpha')
    parser.add_argument('--repeat', type=int, default=1,
                        help='Repeat training on the dataset in one epoch')
    args = parser.parse_args()

    set_global_seeds(args.seed)
    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)
    scheduler = Scheduler(args.start_episode, args.end_episode, decay_type=args.decay_type)
    train(
        args.env,
        num_timesteps=args.num_timesteps,
        seed=args.seed,
        policy=args.policy,
        peer=args.peer,
        scheduler=scheduler,
        individual=args.individual,
        repeat=args.repeat,
    )
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert', type=str, default=None, help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy', type=str, default='CnnPolicy',
                        choices=['CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                                 'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert, batch_size=128, train_fraction=0.99, verbose=1)
    model = GAIL(args.policy, env, dataset, timesteps_per_batch=1280, verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
def get_results(name, lagrangian_values, layer_values_list, perm_num):
    if perm_num == 1:
        lin_reg = get_linear_regressions_1_perm(lagrangian_values[name], layer_values_list)
    else:
        lin_reg = get_linear_regressions_2_perm(lagrangian_values[name], layer_values_list)

    best_lin_reg = []
    for lin_l in lin_reg:
        if lin_l == []:
            best_lin_reg.append([])
        else:
            best_lin_reg.append(lin_l[np.argmin(lin_l[:, 0])])

    best_lin_reg = np.array(best_lin_reg)
    logger.info(f"dumping {perm_num} and {name}")
    lin_reg.dump(f"lin_reg_{perm_num}_{name}.txt")
    best_lin_reg.dump(f"best_lin_reg_{perm_num}_{name}.txt")
    return lin_reg, best_lin_reg
def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize, extrahid, sy_nenvs, sy_nsteps,
                 pdparamsize, rec_gate_init):
    data_format = 'NHWC'
    ph = ph_ob
    assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
    logger.info("CnnGruPolicy: using '%s' shape %s as image input" % (ph.name, str(ph.shape)))
    X = tf.cast(ph, tf.float32) / 255.
    X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

    activ = tf.nn.relu
    yes_gpu = any(get_available_gpus())

    with tf.variable_scope(scope, reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
        X = activ(conv(X, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), data_format=data_format))
        X = activ(conv(X, 'c3', nf=64, rf=4, stride=1, init_scale=np.sqrt(2), data_format=data_format))
        X = to2d(X)
        X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
        X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
        # note: a leftover `import pdb; pdb.set_trace()` debug breakpoint was removed here
        cell = GRUCell(memsize, rec_gate_init=rec_gate_init)
        cell.get_initial_state(ph_istate)
        my_rnn = keras.layers.RNN(cell, dtype=tf.float32, time_major=False)
        X, snext = my_rnn((X, ph_new[:, :, None]))
        X = tf.reshape(X, (-1, memsize))
        Xtout = X
        if extrahid:
            Xtout = X + activ(fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
            X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
        pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
        vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
        vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)
        pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
        vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
        vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
    return pdparam, vpred_int, vpred_ext, snext
def get_target_updates(_vars, target_vars, tau, verbose=0):
    """
    get target update operations

    :param _vars: ([TensorFlow Tensor]) the initial variables
    :param target_vars: ([TensorFlow Tensor]) the target variables
    :param tau: (float) the soft update coefficient (keep old values, between 0 and 1)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :return: (TensorFlow Operation, TensorFlow Operation) initial update, soft update
    """
    if verbose >= 2:
        logger.info('setting up target updates ...')
    soft_updates = []
    init_updates = []
    assert len(_vars) == len(target_vars)
    for var, target_var in zip(_vars, target_vars):
        if verbose >= 2:
            logger.info('  {} <- {}'.format(target_var.name, var.name))
        init_updates.append(tf.assign(target_var, var))
        soft_updates.append(tf.assign(target_var, (1. - tau) * target_var + tau * var))
    assert len(init_updates) == len(_vars)
    assert len(soft_updates) == len(_vars)
    return tf.group(*init_updates), tf.group(*soft_updates)
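# --- Hedged usage sketch (not part of the original source) ---
# Self-contained illustration of get_target_updates() above with two toy variables; the scope names
# and tau=0.005 are arbitrary choices for this example, not values from the original code.
if __name__ == '__main__':
    import tensorflow as tf

    with tf.variable_scope('toy_model'):
        main_var = tf.get_variable('w', initializer=tf.ones((2, 2)))
    with tf.variable_scope('toy_target'):
        target_var = tf.get_variable('w', initializer=tf.zeros((2, 2)))

    init_update, soft_update = get_target_updates([main_var], [target_var], tau=0.005, verbose=2)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(init_update)  # hard copy: target <- main
        sess.run(soft_update)  # Polyak step: target <- (1 - tau) * target + tau * main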
def _setup_param_noise(self, normalized_obs0):
    """
    set the parameter noise operations

    :param normalized_obs0: (TensorFlow Tensor) the normalized observation
    """
    assert self.param_noise is not None

    with tf.variable_scope("noise", reuse=False):
        self.perturbed_actor_tf = self.param_noise_actor.make_actor(normalized_obs0)

    with tf.variable_scope("noise_adapt", reuse=False):
        adaptive_actor_tf = self.adaptive_param_noise_actor.make_actor(normalized_obs0)

    with tf.variable_scope("noise_update_func", reuse=False):
        if self.verbose >= 2:
            logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise/pi/',
                                                              self.param_noise_stddev,
                                                              verbose=self.verbose)

        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates('model/pi/', 'noise_adapt/pi/',
                                                                       self.param_noise_stddev,
                                                                       verbose=self.verbose)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))
def _on_step(self) -> bool:
    if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
        # this will set self.best_mean_reward to the reward from the evaluation as it's previously -np.inf
        result = super(SelfPlayCallback, self)._on_step()

        list_of_rewards = MPI.COMM_WORLD.allgather(self.best_mean_reward)
        av_reward = np.mean(list_of_rewards)
        std_reward = np.std(list_of_rewards)
        av_timesteps = np.mean(MPI.COMM_WORLD.allgather(self.num_timesteps))
        total_episodes = np.sum(MPI.COMM_WORLD.allgather(self.n_eval_episodes))

        if self.callback is not None:
            rules_based_rewards = MPI.COMM_WORLD.allgather(self.callback.best_mean_reward)
            av_rules_based_reward = np.mean(rules_based_rewards)

        rank = MPI.COMM_WORLD.Get_rank()
        if rank == 0:
            logger.info("Eval num_timesteps={}, episode_reward={:.2f} +/- {:.2f}".format(
                self.num_timesteps, av_reward, std_reward))
            logger.info("Total episodes ran={}".format(total_episodes))

        # compare the latest reward against the threshold
        if result and av_reward > self.threshold:
            self.generation += 1
            if rank == 0:
                # write new files
                logger.info(f"New best model: {self.generation}\n")
                generation_str = str(self.generation).zfill(5)
                av_rewards_str = str(round(av_reward, 3))
                if self.callback is not None:
                    av_rules_based_reward_str = str(round(av_rules_based_reward, 3))
                else:
                    av_rules_based_reward_str = str(0)

                # this is constantly being written to - not actually the best model
                source_file = os.path.join(config.TMPMODELDIR, f"best_model.zip")
                target_file = os.path.join(self.model_dir, f"_model_{generation_str}_{av_rules_based_reward_str}_{av_rewards_str}_{str(self.base_timesteps + self.num_timesteps)}_.zip")
                copyfile(source_file, target_file)
                target_file = os.path.join(self.model_dir, f"best_model.zip")
                copyfile(source_file, target_file)

            # if playing against a rules based agent, update the global best reward to the improved metric
            if self.opponent_type == 'rules':
                self.threshold = av_reward

        # reset best_mean_reward because this is what we use to extract the rewards from the latest
        # evaluation by each agent
        self.best_mean_reward = -np.inf
        if self.callback is not None:
            # if evaling against rules-based agent as well, reset this too
            self.callback.best_mean_reward = -np.inf

    return True
def _setup_actor_optimizer(self):
    """
    setup the optimizer for the actor
    """
    if self.verbose >= 2:
        logger.info('setting up actor optimizer')
    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
    actor_shapes = [var.get_shape().as_list() for var in tf_util.get_trainable_vars('model/pi/')]
    actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    if self.verbose >= 2:
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
    self.actor_grads = tf_util.flatgrad(self.actor_loss, tf_util.get_trainable_vars('model/pi/'),
                                        clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(var_list=tf_util.get_trainable_vars('model/pi/'),
                                   beta1=0.9, beta2=0.999, epsilon=1e-08)
def _setup_actor_optimizer(self):
    """
    setup the optimizer for the actor
    """
    if self.verbose >= 2:
        logger.info('setting up actor optimizer')

    ### BSS LOSS ###
    all_vars = [v for v in tf.global_variables()]
    self.l2_loss = 0.0
    for var in all_vars:
        if 'pi' in var.name:
            self.l2_loss += tf.losses.mean_squared_error(tf.zeros(var.shape), var)
    pi_features, _ = self.policy_tf.feature_matrices()
    singular_pi = tf.linalg.svd(pi_features, compute_uv=False)
    self.bss_loss = tf.reduce_sum(tf.square(singular_pi[-1]))
    ### BSS LOSS ###

    self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf) + \
        self.bss_coef * self.bss_loss + self.l2_coef * self.l2_loss
    actor_shapes = [
        var.get_shape().as_list()
        for var in tf_util.get_trainable_vars('model/pi/')
    ]
    actor_nb_params = sum(
        [reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
    if self.verbose >= 2:
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
    self.actor_grads = tf_util.flatgrad(
        self.actor_loss,
        tf_util.get_trainable_vars('model/pi/'),
        clip_norm=self.clip_norm)
    self.actor_optimizer = MpiAdam(
        var_list=tf_util.get_trainable_vars('model/pi/'),
        beta1=0.9, beta2=0.999, epsilon=1e-08)
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DDPG",
          reset_num_timesteps=True, replay_wrapper=None):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    if replay_wrapper is not None:
        self.replay_buffer = replay_wrapper(self.replay_buffer)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name,
                                                       new_tb_log) as writer:
        self._setup_learn(seed)

        # a list for tensorboard logging, to prevent logging with the same step number, if it already occurred
        self.tb_seen_steps = []

        rank = MPI.COMM_WORLD.Get_rank()
        # we assume symmetric actions.
        assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high)
        if self.verbose >= 2:
            logger.log('Using agent with the following configuration:')
            logger.log(str(self.__dict__.items()))

        eval_episode_rewards_history = deque(maxlen=100)
        episode_rewards_history = deque(maxlen=100)
        self.episode_reward = np.zeros((1,))
        episode_successes = []
        with self.sess.as_default(), self.graph.as_default():
            # Prepare everything.
            self._reset()
            obs = self.env.reset()
            eval_obs = None
            if self.eval_env is not None:
                eval_obs = self.eval_env.reset()
            episode_reward = 0.
            episode_step = 0
            episodes = 0
            step = 0
            total_steps = 0

            start_time = time.time()

            epoch_episode_rewards = []
            epoch_episode_steps = []
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            eval_episode_rewards = []
            eval_qs = []
            epoch_actions = []
            epoch_qs = []
            epoch_episodes = 0
            epoch = 0
            while True:
                for _ in range(log_interval):
                    # Perform rollouts.
                    for _ in range(self.nb_rollout_steps):
                        if total_steps >= total_timesteps:
                            return self

                        # Predict next action.
                        action, q_value = self._policy(obs, apply_noise=True, compute_q=True)
                        assert action.shape == self.env.action_space.shape

                        # Execute next action.
                        if rank == 0 and self.render:
                            self.env.render()

                        # Randomly sample actions from a uniform distribution
                        # with a probability self.random_exploration (used in HER + DDPG)
                        if np.random.rand() < self.random_exploration:
                            rescaled_action = action = self.action_space.sample()
                        else:
                            rescaled_action = action * np.abs(self.action_space.low)
                        rescaled_action = np.where(action)[0][0]
                        new_obs, reward, done, info = self.env.step(rescaled_action)

                        if writer is not None:
                            ep_rew = np.array([reward]).reshape((1, -1))
                            ep_done = np.array([done]).reshape((1, -1))
                            self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew,
                                                                              ep_done, writer,
                                                                              self.num_timesteps)
                        step += 1
                        total_steps += 1
                        self.num_timesteps += 1
                        if rank == 0 and self.render:
                            self.env.render()
                        episode_reward += reward
                        episode_step += 1

                        # Book-keeping.
                        epoch_actions.append(action)
                        epoch_qs.append(q_value)
                        self._store_transition(obs, action, reward, new_obs, done)
                        obs = new_obs
                        if callback is not None:
                            # Only stop training if return value is False, not when it is None.
                            # This is for backwards compatibility with callbacks that have no return statement.
                            if callback(locals(), globals()) is False:
                                return self

                        if done:
                            # Episode done.
                            epoch_episode_rewards.append(episode_reward)
                            episode_rewards_history.append(episode_reward)
                            epoch_episode_steps.append(episode_step)
                            episode_reward = 0.
                            episode_step = 0
                            epoch_episodes += 1
                            episodes += 1
                            maybe_is_success = info.get('is_success')
                            if maybe_is_success is not None:
                                episode_successes.append(float(maybe_is_success))

                            self._reset()
                            if not isinstance(self.env, VecEnv):
                                obs = self.env.reset()

                    # Train.
                    epoch_actor_losses = []
                    epoch_critic_losses = []
                    epoch_adaptive_distances = []
                    for t_train in range(self.nb_train_steps):
                        # Not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size):
                            break

                        # Adapt param noise, if necessary.
                        if len(self.replay_buffer) >= self.batch_size and \
                                t_train % self.param_noise_adaption_interval == 0:
                            distance = self._adapt_param_noise()
                            epoch_adaptive_distances.append(distance)

                        # weird equation to deal with the fact the nb_train_steps will be different
                        # to nb_rollout_steps
                        step = (int(t_train * (self.nb_rollout_steps / self.nb_train_steps)) +
                                self.num_timesteps - self.nb_rollout_steps)

                        critic_loss, actor_loss = self._train_step(step, writer, log=t_train == 0)
                        epoch_critic_losses.append(critic_loss)
                        epoch_actor_losses.append(actor_loss)
                        self._update_target_net()

                    # Evaluate.
                    eval_episode_rewards = []
                    eval_qs = []
                    if self.eval_env is not None:
                        eval_episode_reward = 0.
                        for _ in range(self.nb_eval_steps):
                            if total_steps >= total_timesteps:
                                return self

                            eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True)
                            eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                                eval_action * np.abs(self.action_space.low))
                            if self.render_eval:
                                self.eval_env.render()
                            eval_episode_reward += eval_r

                            eval_qs.append(eval_q)
                            if eval_done:
                                if not isinstance(self.env, VecEnv):
                                    eval_obs = self.eval_env.reset()
                                eval_episode_rewards.append(eval_episode_reward)
                                eval_episode_rewards_history.append(eval_episode_reward)
                                eval_episode_reward = 0.

                mpi_size = MPI.COMM_WORLD.Get_size()
                # Log stats.
                # XXX shouldn't call np.mean on variable length lists
                duration = time.time() - start_time
                stats = self._get_stats()
                combined_stats = stats.copy()
                combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
                combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
                combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
                combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
                combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
                combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
                combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
                if len(epoch_adaptive_distances) != 0:
                    combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
                combined_stats['total/duration'] = duration
                combined_stats['total/steps_per_second'] = float(step) / float(duration)
                combined_stats['total/episodes'] = episodes
                combined_stats['rollout/episodes'] = epoch_episodes
                combined_stats['rollout/actions_std'] = np.std(epoch_actions)
                # Evaluation statistics.
                if self.eval_env is not None:
                    combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                    combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                    combined_stats['eval/Q'] = np.mean(eval_qs)
                    combined_stats['eval/episodes'] = len(eval_episode_rewards)

                def as_scalar(scalar):
                    """
                    check and return the input if it is a scalar, otherwise raise ValueError

                    :param scalar: (Any) the object to check
                    :return: (Number) the scalar if x is a scalar
                    """
                    if isinstance(scalar, np.ndarray):
                        assert scalar.size == 1
                        return scalar[0]
                    elif np.isscalar(scalar):
                        return scalar
                    else:
                        raise ValueError('expected scalar, got %s' % scalar)

                combined_stats_sums = MPI.COMM_WORLD.allreduce(
                    np.array([as_scalar(x) for x in combined_stats.values()]))
                combined_stats = {k: v / mpi_size
                                  for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

                # Total statistics.
                combined_stats['total/epochs'] = epoch + 1
                combined_stats['total/steps'] = step

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()
                if rank == 0 and logdir:
                    if hasattr(self.env, 'get_state'):
                        with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as file_handler:
                            pickle.dump(self.env.get_state(), file_handler)
                    if self.eval_env and hasattr(self.eval_env, 'get_state'):
                        with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as file_handler:
                            pickle.dump(self.eval_env.get_state(), file_handler)
def learn(self, total_timesteps, callback=None, vae=None, skip_episodes=5, tb_log_name="DDPG"):
    rank = MPI.COMM_WORLD.Get_rank()
    # we assume symmetric actions.
    assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high)
    self.episode_reward = np.zeros((1,))
    with self.sess.as_default(), self.graph.as_default():
        print(self.sess._config)
        # Prepare everything.
        self._reset()
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        step = 0
        total_steps = 0

        start_time = time.time()

        actor_losses = []
        critic_losses = []

        while True:
            obs = self.env.reset()
            # Rollout one episode.
            while True:
                if total_steps >= total_timesteps:
                    return self

                # Predict next action.
                action, q_value = self._policy(obs, apply_noise=True, compute_q=True)
                print(action)
                assert action.shape == self.env.action_space.shape

                # Execute next action.
                if rank == 0 and self.render:
                    self.env.render()
                new_obs, reward, done, _ = self.env.step(action * np.abs(self.action_space.low))

                step += 1
                total_steps += 1
                if rank == 0 and self.render:
                    self.env.render()
                episode_reward += reward
                episode_step += 1

                # Book-keeping.
                # Do not record observations, while we skip DDPG training.
                if (episodes + 1) > skip_episodes:
                    self._store_transition(obs, action, reward, new_obs, done)
                obs = new_obs
                if callback is not None:
                    callback(locals(), globals())

                if done:
                    print("episode finished. Reward: ", episode_reward)
                    # Episode done.
                    episode_reward = 0.
                    episode_step = 0
                    episodes += 1
                    self._reset()
                    obs = self.env.reset()
                    # Finish rollout on episode finish.
                    break

            print("rollout finished")

            # Train VAE.
            train_start = time.time()
            vae.optimize()
            print("VAE training duration:", time.time() - train_start)

            # Train DDPG.
            actor_losses = []
            critic_losses = []
            train_start = time.time()
            if episodes > skip_episodes:
                for t_train in range(self.nb_train_steps):
                    critic_loss, actor_loss = self._train_step(0, None, log=t_train == 0)
                    critic_losses.append(critic_loss)
                    actor_losses.append(actor_loss)
                    self._update_target_net()
                print("DDPG training duration:", time.time() - train_start)

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = self._get_stats()
            combined_stats = stats.copy()
            combined_stats['train/loss_actor'] = np.mean(actor_losses)
            combined_stats['train/loss_critic'] = np.mean(critic_losses)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(step) / float(duration)
            combined_stats['total/episodes'] = episodes

            def as_scalar(scalar):
                """
                check and return the input if it is a scalar, otherwise raise ValueError

                :param scalar: (Any) the object to check
                :return: (Number) the scalar if x is a scalar
                """
                if isinstance(scalar, np.ndarray):
                    assert scalar.size == 1
                    return scalar[0]
                elif np.isscalar(scalar):
                    return scalar
                else:
                    raise ValueError('expected scalar, got %s' % scalar)

            combined_stats_sums = MPI.COMM_WORLD.allreduce(
                np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size
                              for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/steps'] = step

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs):
    """
    run the training of DDPG

    :param env_id: (str) the environment ID
    :param seed: (int) the initial random seed
    :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise
        types by separating them with commas
    :param layer_norm: (bool) use layer normalization
    :param evaluation: (bool) enable evaluation of DDPG training
    :param kwargs: (dict) extra keywords for the training.train function
    """

    # Configure things.
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    # Create envs.
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    # Parse noise_type
    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]
    for current_noise_type in noise_type.split(','):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == 'none':
            pass
        elif 'adaptive-param' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev),
                                                 desired_action_stddev=float(stddev))
        elif 'normal' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = NormalActionNoise(mean=np.zeros(nb_actions),
                                             sigma=float(stddev) * np.ones(nb_actions))
        elif 'ou' in current_noise_type:
            _, stddev = current_noise_type.split('_')
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions),
                                                        sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError('unknown noise type "{}"'.format(current_noise_type))

    # Seed everything to make things reproducible.
    seed = seed + 1000000 * rank
    logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    # Disable logging for rank != 0 to avoid noise.
    start_time = 0
    if rank == 0:
        start_time = time.time()

    model = DDPG(policy=MlpPolicy, env=env, memory_policy=Memory, eval_env=eval_env,
                 param_noise=param_noise, action_noise=action_noise, memory_limit=int(1e6),
                 layer_norm=layer_norm, verbose=2, **kwargs)
    model.learn(total_timesteps=10000)
    env.close()
    if eval_env is not None:
        eval_env.close()

    if rank == 0:
        logger.info('total runtime: {}s'.format(time.time() - start_time))
def learn(self, total_timesteps, callback=None, seed=None, log_interval=None, tb_log_name="DDPG",
          reset_num_timesteps=True, replay_wrapper=None):
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    if replay_wrapper is not None:
        self.replay_buffer = replay_wrapper(self.replay_buffer)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name,
                                                       new_tb_log) as writer:
        # a list for tensorboard logging, to prevent logging with the same step number, if it already occurred
        self.tb_seen_steps = []

        # rank = MPI.COMM_WORLD.Get_rank()
        # we assume symmetric actions.
        assert np.all(np.abs(self.env.action_space.low) == self.env.action_space.high)
        if self.verbose >= 2:
            logger.log('Using agent with the following configuration:')
            logger.log(str(self.__dict__.items()))

        with self.sess.as_default(), self.graph.as_default():
            # Prepare everything.
            self._reset()
            obs = self.env.reset()
            eval_obs = None
            if self.eval_env is not None:
                eval_obs = self.eval_env.reset()

            episode_rewards_deque = deque(maxlen=100)
            eval_episode_rewards_deque = deque(maxlen=100)
            self.episode_reward = np.zeros((1,))
            episode_successes = []
            episode_rewards_all = []
            episode_steps_all = []
            episode_reward = 0.
            episode_step = 0
            total_steps = 0
            step_since_eval = 0
            total_episode_num = 0
            start_time = time.time()

            while True:
                # Perform rollouts.
                qs_this_rollout_period = []
                actions_this_rollout_period = []
                while True:
                    if total_steps >= total_timesteps:
                        return self

                    # Predict next action.
                    if total_steps <= 10000:
                        action = self.env.action_space.sample()
                        q_value = 0
                    else:
                        action, q_value = self._policy(obs, apply_noise=True, compute_q=True)
                    assert action.shape == self.env.action_space.shape

                    rescaled_action = action * np.abs(self.action_space.low)
                    new_obs, reward, done, info = self.env.step(rescaled_action)

                    if writer is not None:
                        ep_rew = np.array([reward]).reshape((1, -1))
                        ep_done = np.array([done]).reshape((1, -1))
                        self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done,
                                                                          writer, self.num_timesteps)
                    total_steps += 1
                    self.num_timesteps += 1
                    episode_reward += reward
                    episode_step += 1
                    step_since_eval += 1

                    # Book-keeping.
                    actions_this_rollout_period.append(action)
                    qs_this_rollout_period.append(q_value)
                    self._store_transition(obs, action, reward, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        episode_rewards_all.append(episode_reward)
                        episode_rewards_deque.append(episode_reward)
                        episode_steps_all.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        total_episode_num += 1
                        maybe_is_success = info.get('is_success')
                        if maybe_is_success is not None:
                            episode_successes.append(float(maybe_is_success))

                        self._reset()
                        if not isinstance(self.env, VecEnv):
                            obs = self.env.reset()
                        break

                # Train.
                actor_losses_this_train_period = []
                critic_losses_this_train_period = []
                last_episode_step = int(episode_steps_all[-1])
                for t_train in range(last_episode_step):
                    # Not enough samples in the replay buffer
                    if not self.replay_buffer.can_sample(self.batch_size):
                        break

                    # weird equation to deal with the fact the nb_train_steps will be different
                    # to nb_rollout_steps
                    step = total_steps - last_episode_step + t_train
                    critic_loss, actor_loss = self._train_step(step, writer, do_actor_update=t_train % 2 == 0)
                    critic_losses_this_train_period.append(critic_loss)
                    if actor_loss:
                        actor_losses_this_train_period.append(actor_loss)
                    self._update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if self.eval_env is not None and step_since_eval >= self.eval_freq:
                    step_since_eval %= self.eval_freq
                    eval_episode_reward = 0.
                    eval_episode = 0
                    while eval_episode < 10:
                        eval_action, eval_q = self._policy(eval_obs, apply_noise=False, compute_q=True)
                        eval_obs, eval_r, eval_done, _ = self.eval_env.step(
                            eval_action * np.abs(self.action_space.low))
                        eval_episode_reward += eval_r
                        eval_qs.append(eval_q)
                        if eval_done:
                            if not isinstance(self.env, VecEnv):
                                eval_obs = self.eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_deque.append(eval_episode_reward)
                            eval_episode_reward = 0.
                            eval_episode += 1

                if callback is not None:
                    # Only stop training if return value is False, not when it is None.
                    # This is for backwards compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        return self

                # mpi_size = MPI.COMM_WORLD.Get_size()
                # Log stats.
                # XXX shouldn't call np.mean on variable length lists
                duration = time.time() - start_time
                stats = self._get_stats()
                combined_stats = stats.copy()
                combined_stats['rollout/return'] = episode_rewards_all[-1]
                combined_stats['rollout/return_last_100'] = np.mean(episode_rewards_deque)
                combined_stats['rollout/episode_steps'] = episode_steps_all[-1]
                combined_stats['debug/actions_mean'] = np.mean(actions_this_rollout_period)
                combined_stats['debug/actions_std'] = np.std(actions_this_rollout_period)
                combined_stats['debug/Q_mean'] = np.mean(qs_this_rollout_period)
                combined_stats['train/loss_actor'] = np.mean(actor_losses_this_train_period)
                combined_stats['train/loss_critic'] = np.mean(critic_losses_this_train_period)
                combined_stats['total/duration'] = duration
                combined_stats['total/steps_per_second'] = float(total_steps) / float(duration)

                # Evaluation statistics.
                if self.eval_env is not None and eval_episode_rewards:
                    combined_stats['eval/return'] = np.mean(eval_episode_rewards)
                    combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_deque)
                    combined_stats['eval/Q'] = np.mean(eval_qs)
                    combined_stats['eval/episodes'] = len(eval_episode_rewards)

                def as_scalar(scalar):
                    """
                    check and return the input if it is a scalar, otherwise raise ValueError

                    :param scalar: (Any) the object to check
                    :return: (Number) the scalar if x is a scalar
                    """
                    if isinstance(scalar, np.ndarray):
                        assert scalar.size == 1
                        return scalar[0]
                    elif np.isscalar(scalar):
                        return scalar
                    else:
                        raise ValueError('expected scalar, got %s' % scalar)

                # combined_stats_sums = MPI.COMM_WORLD.allreduce(
                #     np.array([as_scalar(x) for x in combined_stats.values()]))
                # combined_stats = {k: v / mpi_size
                #                   for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

                # Total statistics.
                combined_stats['total/episodes'] = total_episode_num
                combined_stats['total/steps'] = total_steps

                for key in sorted(combined_stats.keys()):
                    logger.record_tabular(key, combined_stats[key])
                if len(episode_successes) > 0:
                    logger.logkv("success rate", np.mean(episode_successes[-100:]))
                logger.dump_tabular()
                logger.info('')
                logdir = logger.get_dir()