def train(self, cache, i=None):
    print(f"Training {self.policy.__class__} policy with key {self.policy.key}")
    logger.configure()
    sampler = Sampler(env=self.env, policy=self.policy)
    buffer = self.policy.make_training_buffer()

    nbatch = self.policy.config.training.nsteps * self.policy.config.training.nenvs

    i, extra_data = self.restore_training_checkpoint(cache=cache)
    total_timesteps = (i - 1) * nbatch
    total_episodes = extra_data.get('total_episodes', 0)
    epinfobuf = extra_data.get('epinfobuf', deque(maxlen=100))

    log_freq = 1
    while total_timesteps < self.policy.config.training.total_timesteps:
        batch = sampler.sample_batch(self.policy.config.training.nsteps)
        epinfobuf.extend(batch.env_info.epinfobuf)
        buffer.add_batch(batch)

        self.policy.train_step(
            buffer=buffer, itr=i, logger=logger,
            log_freq=log_freq, cache=cache, save_freq=None)

        if i % log_freq == 0:
            logger.logkv('itr', i)
            logger.logkv('cumulative episodes', total_episodes)
            logger.logkv('timesteps covered', total_timesteps)
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('buffer size', buffer.time_shape.size)
            logger.dumpkvs()

        i += 1
        total_episodes += len(batch.env_info.epinfobuf)
        total_timesteps += nbatch

        if i % int(self.policy.config.training.total_timesteps / (10 * nbatch)) == 0:
            print("Doing a cache roundtrip...")
            self.store_training_checkpoint(cache, itr=i, extra_data={
                'total_episodes': total_episodes,
                'epinfobuf': epinfobuf
            })
            stored_i, _ = self.restore_training_checkpoint(cache, itr=i)
            assert stored_i == i

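# `safemean` is used by the logging code above and below but is not defined in
# these snippets. A minimal sketch, assuming the baselines convention of
# returning NaN instead of raising when the episode-info buffer is still empty:
import numpy as np

def safemean(xs):
    # Avoid a warning from np.mean on an empty sequence.
    return np.nan if len(xs) == 0 else np.mean(xs)
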
def log_performance(self, i):
    logger.logkv('itr', i)
    logger.logkv('cumulative episodes', self.total_episodes)
    logger.logkv('timesteps covered', i * self.env.num_envs * self.batch_t)
    logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in self.eval_epinfobuf]))
    logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in self.eval_epinfobuf]))
    logger.logkv('buffer size', self.buffer.time_shape.size)
    logger.logkv(
        'memory used (GB)',
        psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024 * 1024))
    logger.dumpkvs()

def print_log(*, model, run_info, batching_config, lossvals, update, fps,
              epinfobuf, tnow, tfirststart):
    ev = explained_variance(run_info.values, run_info.returns)
    logger.logkv("serial_timesteps", update * batching_config.nsteps)
    logger.logkv("nupdates", update)
    logger.logkv("total_timesteps", update * batching_config.nbatch)
    logger.logkv("fps", fps)
    logger.logkv("explained_variance", float(ev))
    logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
    logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
    logger.logkv('time_elapsed', tnow - tfirststart)
    for (lossval, lossname) in zip(lossvals, model.loss_names):
        logger.logkv(lossname, lossval)
    logger.dumpkvs()

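# `explained_variance` comes from baselines.common. A minimal sketch of the
# quantity being logged, assuming 1-D arrays of predicted values and empirical
# returns: 1 - Var[y - ypred] / Var[y]. Values near 1 mean the value function
# predicts the returns well; values <= 0 mean it does no better than a constant.
import numpy as np

def explained_variance(ypred, y):
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary
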
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1,
          nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
          lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None,
          lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs,
                               total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean",
                                  safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean",
                                  safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model

def learn(
        network,
        env,
        seed=None,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100,
        load_path=None,
        **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with the given
    network architecture on a given environment using the A2C algorithm.

    Parameters:
    -----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm,
        cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py
        for the full list) specifying a standard network architecture, or a
        function that takes a tensorflow tensor as input and returns a tuple
        (output_tensor, extra_feed), where output_tensor is the last network
        layer output, extra_feed is None for feed-forward neural nets, and
        extra_feed is a dictionary describing how to feed state into the
        network for recurrent neural nets. See baselines.common/policies.py/lstm
        for more details on using recurrent nets in policies.

    env: RL environment. Should implement an interface similar to VecEnv
        (baselines.common/vec_env) or be wrapped with DummyVecEnv
        (baselines.common/vec_env/dummy_vec_env.py).

    seed: seed to make the random number sequence in the algorithm
        reproducible. Defaults to None, which means the seed comes from the
        system noise generator (not reproducible).

    nsteps: int, number of steps of the vectorized environment per update
        (i.e. batch size is nsteps * nenv where nenv is the number of
        environment copies simulated in parallel)

    total_timesteps: int, total number of timesteps to train on (default: 80M)

    vf_coef: float, coefficient in front of the value function loss in the
        total loss function (default: 0.5)

    ent_coef: float, coefficient in front of the policy entropy in the total
        loss function (default: 0.01)

    max_grad_norm: float, gradients are clipped so that their global L2 norm
        is no larger than this value (default: 0.5)

    lr: float, learning rate for RMSProp (the current implementation has
        RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of the learning rate. Can be 'linear', 'constant', or
        a function [0..1] -> [0..1] that takes the fraction of training
        progress as input and returns the fraction of the learning rate
        (specified as lr) as output.

    epsilon: float, RMSProp epsilon (stabilizes the square root computation in
        the denominator of the RMSProp update) (default: 1e-5)

    alpha: float, RMSProp decay parameter (default: 0.99)

    gamma: float, reward discounting parameter (default: 0.99)

    log_interval: int, specifies how frequently the logs are printed out
        (default: 100)

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and arguments to a
        particular type of network. For instance, the 'mlp' network
        architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr,
                  alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get a mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean",
                                  safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean",
                                  safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model

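# Usage sketch for the A2C `learn` above (not part of the original source):
# build a small vectorized CartPole environment, wrap each copy in a Monitor so
# episode rewards/lengths reach `epinfobuf`, and train an MLP policy briefly.
# Assumes gym, baselines, and their TensorFlow dependency are installed.
import gym
from baselines import bench
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

def _demo_a2c_cartpole():
    make_env = lambda: bench.Monitor(gym.make('CartPole-v1'), None, allow_early_resets=True)
    venv = DummyVecEnv([make_env for _ in range(4)])
    return learn(network='mlp', env=venv, seed=0, nsteps=5,
                 total_timesteps=20000, log_interval=100)
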
def mean_length(self):
    return ppo2.safemean([epinfo['l'] for epinfo in self._epinfobuf])

def mean_reward(self):
    return ppo2.safemean([epinfo['r'] for epinfo in self._epinfobuf])

def learn_ent_hoof_a2c(network, env, optimiser, seed=None, nsteps=5,
                       total_timesteps=int(1e6), lr_upper_bound=None,
                       ent_upper_bound=None, num_lr=None, num_ent_coeff=None,
                       gamma=0.99, max_kl=None, max_grad_norm=0.5,
                       log_interval=100, load_path=None, **network_kwargs):
    '''
    Main entrypoint for the HOOF A2C algorithm. Train a policy with the given
    network architecture on a given environment using A2C, while tuning the
    learning rate and entropy coefficient on the fly from randomly sampled
    candidates evaluated by weighted importance sampling.

    Parameters:
    -----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm,
        cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py
        for the full list) specifying a standard network architecture, or a
        function that takes a tensorflow tensor as input and returns a tuple
        (output_tensor, extra_feed), where output_tensor is the last network
        layer output, extra_feed is None for feed-forward neural nets, and
        extra_feed is a dictionary describing how to feed state into the
        network for recurrent neural nets. See baselines.common/policies.py/lstm
        for more details on using recurrent nets in policies.

    env: RL environment. Should implement an interface similar to VecEnv
        (baselines.common/vec_env) or be wrapped with DummyVecEnv
        (baselines.common/vec_env/dummy_vec_env.py).

    seed: seed to make the random number sequence in the algorithm
        reproducible. Defaults to None, which means the seed comes from the
        system noise generator (not reproducible).

    nsteps: int, number of steps of the vectorized environment per update
        (i.e. batch size is nsteps * nenv where nenv is the number of
        environment copies simulated in parallel)

    total_timesteps: int, total number of timesteps to train on (default: 1M)

    lr_upper_bound: float, upper bound of the range from which candidate
        learning rates are sampled each update

    ent_upper_bound: float, upper bound of the range from which candidate
        entropy coefficients are sampled each update

    num_lr: int, number of candidate learning rates evaluated per update

    num_ent_coeff: int, number of candidate entropy coefficients evaluated per
        update

    max_kl: float, approximate KL-divergence constraint on candidate updates
        (default: None, i.e. effectively unconstrained)

    max_grad_norm: float, gradients are clipped so that their global L2 norm
        is no larger than this value (default: 0.5)

    gamma: float, reward discounting parameter (default: 0.99)

    log_interval: int, specifies how frequently the logs are printed out
        (default: 100)

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and arguments to a
        particular type of network. For instance, the 'mlp' network
        architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Ent_HOOF_Model(optimiser=optimiser, policy=policy, env=env,
                           nsteps=nsteps, total_timesteps=total_timesteps,
                           max_grad_norm=max_grad_norm)
    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch size
    nbatch = nenvs * nsteps

    # Model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    def kl(new_mean, new_sd, old_mean, old_sd):
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd**2 + (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    # Set max_kl to a high value in case there is no constraint
    if max_kl is None:
        max_kl = 10**8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        opt_pol_val = -10**8
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))
        rand_lr = lr_upper_bound * np.random.rand(num_lr)
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(num_ent_coeff)

        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()
        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)

        for nec in range(num_ent_coeff):
            # Reset the policy and the RMSProp optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # Get gradients for the loss function with the given entropy coefficient
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values, rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params

            # Enumerate over the candidate learning rates
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(obs, actions)
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio)

                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val
                if (nec == 0 and nlr == 0) or (est_pol_val > opt_pol_val
                                               and approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # Update the policy and RMSProp state to the optimal weights
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink the LR search space if too many candidates get rejected
        rejections = np.sum(approx_kl > max_kl) / num_lr
        if rejections > 0.8:
            lr_upper_bound *= 0.8
        if rejections == 0:
            lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
logger.record_tabular("opt_lr", float(opt_lr)) logger.record_tabular("ent_coeff", float(opt_ent_coeff)) logger.record_tabular("approx_kl", float(opt_kl)) logger.record_tabular("rejections", rejections) logger.record_tabular("lr_ub", lr_upper_bound) logger.record_tabular( "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf])) logger.record_tabular( "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf])) logger.dump_tabular() return model
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048,
          nbatch=None, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=0.5,
          gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4,
          cliprange=0.2, save_interval=0, load_path=None, model_fn=None,
          mode='hippo', use_buffer=False, buffer_capacity=None, hindsight=0.5,
          reward_fn, **network_kwargs):
    '''
    Learn a policy using the PPO algorithm (https://arxiv.org/abs/1707.06347).

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm,
        cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py
        for the full list) specifying a standard network architecture, or a
        function that takes a tensorflow tensor as input and returns a tuple
        (output_tensor, extra_feed), where output_tensor is the last network
        layer output, extra_feed is None for feed-forward neural nets, and
        extra_feed is a dictionary describing how to feed state into the
        network for recurrent neural nets. See common/models.py/lstm for more
        details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized
        for parallel environment simulation. Environments produced by gym.make
        can be wrapped using the baselines.common.vec_env.DummyVecEnv class.

    nsteps: int, number of steps of the vectorized environment per update

    nbatch: int, batch size

    total_timesteps: int, number of timesteps (i.e. number of actions taken in
        the environment)

    ent_coef: float, policy entropy coefficient in the optimization objective

    lr: float or function, learning rate, constant or a schedule function
        [0,1] -> R+ where 1 is the beginning of training and 0 is the end of
        training.

    vf_coef: float, value function loss coefficient in the optimization
        objective

    max_grad_norm: float or None, gradient norm clipping coefficient

    gamma: float, discounting factor

    lam: float, advantage estimation discounting factor (lambda in the paper)

    log_interval: int, number of timesteps between logging events

    nminibatches: int, number of training minibatches per update. For
        recurrent policies, should be smaller than or equal to the number of
        environments run in parallel.

    noptepochs: int, number of training epochs per update

    cliprange: float or function, clipping range, constant or a schedule
        function [0,1] -> R+ where 1 is the beginning of training and 0 is the
        end of training

    save_interval: int, number of timesteps between saving events

    load_path: str, path to load the model from

    mode: switch between 'ppo' and 'hippo', default 'hippo'

    buffer_capacity: max number of steps stored in the replay buffer

    hindsight: fraction of the batch paths with hindsight

    reward_fn: reward function used to recompute the reward under a new goal

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and arguments to a
        particular type of network. For instance, the 'mlp' network
        architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Get the number of environments
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Build the policy.
    # But first add shape & dtype attributes to the env's observation space
    # (needed for building the policy network).
    dtype = None
    size = 0
    for key in ['observation', 'achieved_goal', 'desired_goal']:
        space = ob_space.spaces[key]
        shape = space.shape
        size += np.prod(shape)
        # Check that all observation sub-spaces share the same dtype
        if dtype is not None:
            assert space.dtype == dtype, 'dtype not same between observation spaces'
        dtype = space.dtype
    ob_space.shape = (size,)
    ob_space.dtype = dtype

    policy = build_policy(env, network, **network_kwargs)

    # Calculate the batch size; nbatch is a rough approximation
    if nbatch is None:
        nbatch = nsteps * nenvs
    nbatch_train = nbatch // nminibatches

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from baselines.ppo2.model import Model
        model_fn = Model

    model = model_fn(policy=policy, ob_space=ob_space, ac_space=ac_space,
                     nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                     ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)

    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model=model, nsteps=nsteps)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps)

    epinfobuf = deque(maxlen=100)
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    # Instantiate the replay buffer
    if use_buffer:
        if buffer_capacity is None:
            buffer_capacity = nbatch
        replay_buffer = ReplayBuffer(capacity=buffer_capacity)

    # Start total timer
    tfirststart = time.perf_counter()

    her_timesteps = 0
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # Collect new trajectories here
        paths, epinfos = runner.run()  # pylint: disable=E0632
        if eval_env is not None:
            eval_paths, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

        epinfobuf.extend(epinfos)
        if eval_env is not None:
            eval_epinfobuf.extend(eval_epinfos)

        if mode == 'hippo':
            batch_paths = []
            if use_buffer:
                for path in paths:
                    replay_buffer.insert(path)
                nsamples = 0
                while nsamples < nbatch:
                    path = replay_buffer.sample()
                    subpath = random_subpath(path)
                    if np.random.uniform() < hindsight:
                        if len(subpath) == len(path):
                            subpath.pop_step()
                        subpath = apply_hindsight(path, reward_fn)
                    batch_paths.append(subpath)
                    nsamples += len(subpath)
            else:
                nsamples = 0
                paths = itertools.cycle(paths)
                while nsamples < nbatch:
                    path = next(paths)
                    subpath = random_subpath(path)
                    if np.random.uniform() < hindsight:
                        if len(subpath) == len(path):
                            subpath.pop_step()
                        subpath = apply_hindsight(path, reward_fn)
                    batch_paths.append(subpath)
                    nsamples += len(subpath)
        elif mode == 'ppo':
            batch_paths = paths

        obs, returns, masks, actions, values, neglogpacs = batch(
            env, model, gamma, lam, batch_paths)
        _nbatch = (len(obs) // nbatch_train) * nbatch_train
        her_timesteps += _nbatch

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        # Create the array of indices for the batch
        inds = np.arange(_nbatch)
        for _ in range(noptepochs):
            # Randomize the indices
            np.random.shuffle(inds)
            # 0 to batch_size with batch_train_size step
            for start in range(0, _nbatch, nbatch_train):
                end = start + nbatch_train
                mbinds = inds[start:end]
                slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                # Feedforward --> get losses --> update
                mblossvals.append(model.train(lrnow, cliprangenow, *slices))

        lossvals = np.mean(mblossvals, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frames per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nsteps * nenvs)
            logger.logkv("total_steps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() \
                and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0):
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    return model

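# `constfn` is used above to turn a float lr/cliprange into a schedule. A
# minimal sketch following the baselines convention, plus an example of a
# custom schedule: `lr(frac)` and `cliprange(frac)` receive the remaining
# training fraction (1.0 at the start of training, approaching 0.0 at the end).
def constfn(val):
    def f(_):
        return val
    return f

# Example: anneal the learning rate linearly from 3e-4 down to 0 over training.
linear_lr = lambda frac: 3e-4 * frac
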
# Fragment of an A2C-style training loop that also collects per-update
# statistics for plotting.

# Calculate the fps (frames per second)
fps = int((update * nbatch) / nseconds)
if update % log_interval == 0 or update == 1:
    # Check whether the value function is a good predictor of the
    # returns (ev close to 1) or no better than predicting nothing (ev <= 0)
    ev = explained_variance(values, rewards)
    logger.record_tabular("nupdates", str(update))
    logger.record_tabular("total_timesteps", update * nbatch)
    logger.record_tabular('rewards', np.mean(rewards))
    logger.record_tabular('values', np.mean(values))
    logger.record_tabular("fps", fps)
    logger.record_tabular("policy_entropy", float(policy_entropy))
    logger.record_tabular("value_loss", float(value_loss))
    logger.record_tabular("explained_variance", float(ev))
    logger.record_tabular("eprewmean",
                          safemean([epinfo['r'] for epinfo in epinfobuf]))
    logger.record_tabular("eplenmean",
                          safemean([epinfo['l'] for epinfo in epinfobuf]))
    logger.dump_tabular()

# Data for the graphs
graph_data['policy_entropy'].append(float(policy_entropy))
graph_data['value_loss'].append(float(value_loss))
graph_data['policy_loss'].append(float(policy_loss))
graph_data['values_mean'].append(np.mean(values))
graph_data['values_min'].append(np.min(values))
graph_data['values_max'].append(np.max(values))
graph_data['values_std'].append(np.std(values))
graph_data['values_median'].append(np.median(values))
graph_data['rewards_mean'].append(np.mean(rewards))
graph_data['rewards_min'].append(np.min(rewards))

def learn(network, env, seed=None, nsteps=5, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4,
          lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99,
          log_interval=100, load_path=None, **network_kwargs):
    '''
    Main entrypoint for the A2C algorithm. Train a policy with the given
    network architecture on a given environment using the A2C algorithm.

    Parameters:
    -----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm,
        cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py
        for the full list) specifying a standard network architecture, or a
        function that takes a tensorflow tensor as input and returns a tuple
        (output_tensor, extra_feed), where output_tensor is the last network
        layer output, extra_feed is None for feed-forward neural nets, and
        extra_feed is a dictionary describing how to feed state into the
        network for recurrent neural nets. See baselines.common/policies.py/lstm
        for more details on using recurrent nets in policies.

    env: RL environment. Should implement an interface similar to VecEnv
        (baselines.common/vec_env) or be wrapped with DummyVecEnv
        (baselines.common/vec_env/dummy_vec_env.py).

    seed: seed to make the random number sequence in the algorithm
        reproducible. Defaults to None, which means the seed comes from the
        system noise generator (not reproducible).

    nsteps: int, number of steps of the vectorized environment per update
        (i.e. batch size is nsteps * nenv where nenv is the number of
        environment copies simulated in parallel)

    total_timesteps: int, total number of timesteps to train on (default: 80M)

    vf_coef: float, coefficient in front of the value function loss in the
        total loss function (default: 0.5)

    ent_coef: float, coefficient in front of the policy entropy in the total
        loss function (default: 0.01)

    max_grad_norm: float, gradients are clipped so that their global L2 norm
        is no larger than this value (default: 0.5)

    lr: float, learning rate for RMSProp (the current implementation has
        RMSProp hardcoded in) (default: 7e-4)

    lrschedule: schedule of the learning rate. Can be 'linear', 'constant', or
        a function [0..1] -> [0..1] that takes the fraction of training
        progress as input and returns the fraction of the learning rate
        (specified as lr) as output.

    epsilon: float, RMSProp epsilon (stabilizes the square root computation in
        the denominator of the RMSProp update) (default: 1e-5)

    alpha: float, RMSProp decay parameter (default: 0.99)

    gamma: float, reward discounting parameter (default: 0.99)

    log_interval: int, specifies how frequently the logs are printed out
        (default: 100)

    **network_kwargs: keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and arguments to a
        particular type of network. For instance, the 'mlp' network
        architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs

    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr,
                  alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    for update in range(1, total_timesteps // nbatch + 1):
        # Get a mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean",
                                  safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean",
                                  safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model

def learn(network, env, seed, env_id=None, total_timesteps=int(40e6), gamma=0.99,
          log_interval=100, nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5,
          vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5, kfac_clip=0.001,
          save_interval=None, save_path=None, lrschedule='linear', load_path=None,
          is_async=True, **network_kwargs):
    info_env = gym.make(env_id)
    algo = 'acktr'
    # wandb.init(project="floorplan_generator", name=algo)
    # wandb.config.algo = algo
    # wandb.config.action_space = info_env.action_type
    # wandb.config.step_size = info_env.step_size
    # wandb.config.active_rewards = info_env.active_rewards

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True

    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs,
                               total_timesteps, nprocs=nprocs, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, is_async=is_async)
    # if save_interval and logger.get_dir():
    #     import cloudpickle
    #     print(osp.join(logger.get_dir(), 'make_model.pkl'))
    #     with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb+') as fh:
    #         print(make_model)
    #         fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)

        if update % log_interval == 0 or update == 1:
            # images = env.get_images()
            # image = images[0]
            # writer.add_image('imresult', image, update, dataformats='HWC')
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean",
                                  safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean",
                                  safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
            # wandb.log({'eprewmean': safemean([epinfo['r'] for epinfo in epinfobuf]),
            #            'eplenmean': safemean([epinfo['l'] for epinfo in epinfobuf])})

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            savepath = save_path  # save_path overrides the per-update checkpoint path
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model

def learn(network, env, seed=None, nsteps=5, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4,
          lrschedule='linear', epsilon=1e-5, alpha=0.99, gamma=0.99,
          lambda_=0.1, margin=0.1, i_before=1, log_interval=100,
          load_path=None, **network_kwargs):
    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr,
                  alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps,
                  lrschedule=lrschedule, lambda_=lambda_, margin=margin)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch size
    nbatch = nenvs * nsteps

    # Start total timer
    tstart = time.time()

    obses_before: deque[np.ndarray] = deque(maxlen=i_before + 1)

    for update in range(1, total_timesteps // nbatch + 1):
        # Get a mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)

        left_obs = []
        has_left_obs = []
        for start in range(0, nbatch, nsteps):
            result, has_obs = shift(obs[start:start + nsteps], i_before,
                                    fill_value=np.zeros_like(obs[0]))
            left_obs.append(result)
            has_left_obs.append(has_obs)
        left_obs = np.vstack(left_obs)
        has_left_obs = np.hstack(has_left_obs)

        right_obs = []
        has_right_obs = []
        for start in range(0, nbatch, nsteps):
            result, has_obs = shift(obs[start:start + nsteps], -1,
                                    fill_value=np.zeros_like(obs[0]))
            right_obs.append(result)
            has_right_obs.append(has_obs)
        right_obs = np.vstack(right_obs)
        has_right_obs = np.hstack(has_right_obs)

        has_triplet = np.logical_and(has_left_obs, has_right_obs).astype(float)

        policy_loss, value_loss, policy_entropy, repr_loss, delta_d = model.train(
            left_obs, obs, right_obs, states, rewards, masks, actions, values, has_triplet)
        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the
            # returns (ev close to 1) or no better than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("repr_loss", float(repr_loss))
            logger.record_tabular("delta_d", float(delta_d))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean",
                                  safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean",
                                  safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model

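# `shift` is called above but not defined in this snippet. A hypothetical
# sketch consistent with how it is used: shift an array of per-step
# observations along the time axis by n positions (positive n pulls in the
# observation from n steps earlier, negative n from later steps), pad the
# vacated slots with `fill_value`, and return a boolean mask marking which
# entries hold real data.
import numpy as np

def shift(arr, n, fill_value):
    result = np.empty_like(arr)
    has_obs = np.ones(len(arr), dtype=bool)
    if n > 0:
        result[:n] = fill_value
        result[n:] = arr[:-n]
        has_obs[:n] = False
    elif n < 0:
        result[n:] = fill_value
        result[:n] = arr[-n:]
        has_obs[n:] = False
    else:
        result[:] = arr
    return result, has_obs
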