def fit(self,
        obs,
        act,
        obs_next,
        cp_obs,
        cp_act,
        future_bool,
        epochs=1000,
        compute_normalization=True,
        valid_split_ratio=None,
        rolling_average_persitency=None,
        verbose=False,
        log_tabular=False,
        max_logging=5000):
    assert obs.ndim == 2 and obs.shape[1] == self.obs_space_dims * self.future_length
    assert obs_next.ndim == 2 and obs_next.shape[1] == self.obs_space_dims * self.future_length
    assert act.ndim == 2 and act.shape[1] == self.action_space_dims * self.future_length
    assert cp_obs.ndim == 2 and cp_obs.shape[1] == self.obs_space_dims * self.history_length
    assert cp_act.ndim == 2 and cp_act.shape[1] == self.action_space_dims * self.history_length
    assert future_bool.ndim == 2 and future_bool.shape[1] == self.future_length

    if valid_split_ratio is None:
        valid_split_ratio = self.valid_split_ratio
    if rolling_average_persitency is None:
        rolling_average_persitency = self.rolling_average_persitency
    assert 1 > valid_split_ratio >= 0

    sess = tf.compat.v1.get_default_session()

    # Compute forward and backward deltas per single transition, then restore
    # the (batch, future_length * obs_dim) layout.
    obs = obs.reshape(-1, self.obs_space_dims)
    obs_next = obs_next.reshape(-1, self.obs_space_dims)
    delta = self.env.targ_proc(obs, obs_next)
    back_delta = self.env.targ_proc(obs_next, obs)

    obs = obs.reshape(-1, self.future_length * self.obs_space_dims)
    obs_next = obs_next.reshape(-1, self.future_length * self.obs_space_dims)
    delta = delta.reshape(-1, self.future_length * self.obs_space_dims)
    back_delta = back_delta.reshape(-1, self.future_length * self.obs_space_dims)

    single_obs = obs[:, :self.obs_space_dims]
    single_act = act[:, :self.action_space_dims]
    single_delta = delta[:, :self.obs_space_dims]
    single_back_delta = back_delta[:, :self.obs_space_dims]

    # Append the new samples to the replay dataset.
    if self._dataset is None:
        self._dataset = dict(obs=obs, act=act, delta=delta,
                             cp_obs=cp_obs, cp_act=cp_act,
                             future_bool=future_bool,
                             obs_next=obs_next, back_delta=back_delta,
                             single_obs=single_obs, single_act=single_act,
                             single_delta=single_delta,
                             single_back_delta=single_back_delta)
    else:
        for key, new_data in [('obs', obs), ('act', act), ('delta', delta),
                              ('cp_obs', cp_obs), ('cp_act', cp_act),
                              ('future_bool', future_bool),
                              ('obs_next', obs_next),
                              ('back_delta', back_delta),
                              ('single_obs', single_obs),
                              ('single_act', single_act),
                              ('single_delta', single_delta),
                              ('single_back_delta', single_back_delta)]:
            self._dataset[key] = np.concatenate([self._dataset[key], new_data])

    # Recompute input/output normalization statistics over the full dataset.
    if compute_normalization:
        self.compute_normalization(self._dataset['single_obs'],
                                   self._dataset['single_act'],
                                   self._dataset['single_delta'],
                                   self._dataset['cp_obs'],
                                   self._dataset['cp_act'],
                                   self._dataset['single_back_delta'])

    # Split the dataset into training and validation sets.
    dataset_size = self._dataset['obs'].shape[0]
    n_valid_split = min(int(dataset_size * valid_split_ratio), max_logging)
    permutation = np.random.permutation(dataset_size)
    train_idx, valid_idx = permutation[n_valid_split:], permutation[:n_valid_split]

    train_obs, valid_obs = self._dataset['obs'][train_idx], self._dataset['obs'][valid_idx]
    train_act, valid_act = self._dataset['act'][train_idx], self._dataset['act'][valid_idx]
    train_delta, valid_delta = self._dataset['delta'][train_idx], self._dataset['delta'][valid_idx]
    train_cp_obs, valid_cp_obs = self._dataset['cp_obs'][train_idx], self._dataset['cp_obs'][valid_idx]
    train_cp_act, valid_cp_act = self._dataset['cp_act'][train_idx], self._dataset['cp_act'][valid_idx]
    train_obs_next, valid_obs_next = self._dataset['obs_next'][train_idx], self._dataset['obs_next'][valid_idx]
    train_future_bool, valid_future_bool = self._dataset['future_bool'][train_idx], self._dataset['future_bool'][valid_idx]
    train_back_delta, valid_back_delta = self._dataset['back_delta'][train_idx], self._dataset['back_delta'][valid_idx]

    train_obs, train_act, train_delta, train_obs_next, train_back_delta, train_cp_obs, train_cp_act = \
        self._preprocess_inputs(train_obs, train_act, train_delta, train_cp_obs,
                                train_cp_act, train_future_bool, train_obs_next,
                                train_back_delta)
    if n_valid_split > 0:
        valid_obs, valid_act, valid_delta, valid_obs_next, valid_back_delta, valid_cp_obs, valid_cp_act = \
            self._preprocess_inputs(valid_obs, valid_act, valid_delta, valid_cp_obs,
                                    valid_cp_act, valid_future_bool, valid_obs_next,
                                    valid_back_delta)

    valid_loss_rolling_average = None
    valid_loss_rolling_average_prev = None
    epoch_times = []

    # Each ensemble member trains on its own bootstrap resample of the data.
    train_dataset_size = train_obs.shape[0]
    if self.ensemble_size > 1:
        bootstrap_idx = np.random.randint(0, train_dataset_size,
                                          size=(self.ensemble_size, train_dataset_size))
    else:
        bootstrap_idx = np.tile(np.arange(train_dataset_size, dtype='int32'),
                                (self.ensemble_size, 1))

    valid_dataset_size = valid_obs.shape[0]
    valid_bootstrap_idx = np.tile(np.arange(valid_dataset_size, dtype='int32'),
                                  (self.ensemble_size, 1))

    def shuffle_rows(arr):
        # Permute each row of arr independently.
        idxs = np.argsort(np.random.uniform(size=arr.shape), axis=-1)
        return arr[np.arange(arr.shape[0])[:, None], idxs]

    """ ------- Looping over training epochs ------- """
    for epoch in range(epochs):
        # preparations for recording training stats
        mse_losses, back_mse_losses, recon_losses = [], [], []
        t0 = time.time()
        bootstrap_idx = shuffle_rows(bootstrap_idx)

        """ ------- Looping through the shuffled and batched dataset for one epoch ------- """
        for batch_num in range(int(np.ceil(bootstrap_idx.shape[-1] / self.batch_size))):
            batch_idxs = bootstrap_idx[:, batch_num * self.batch_size:
                                       (batch_num + 1) * self.batch_size]
            bootstrap_train_obs = train_obs[batch_idxs]
            bootstrap_train_act = train_act[batch_idxs]
            bootstrap_train_delta = train_delta[batch_idxs]
            bootstrap_train_obs_next = train_obs_next[batch_idxs]
            bootstrap_train_back_delta = train_back_delta[batch_idxs]
            bootstrap_train_cp_obs = train_cp_obs[batch_idxs]
            bootstrap_train_cp_act = train_cp_act[batch_idxs]

            feed_dict = self.get_feed_dict(bootstrap_train_obs,
                                           bootstrap_train_act,
                                           bootstrap_train_delta,
                                           bootstrap_train_obs_next,
                                           bootstrap_train_back_delta,
                                           bootstrap_train_cp_obs,
                                           bootstrap_train_cp_act)
            mse_loss, back_mse_loss, recon_loss, _ = sess.run(
                [self.mse_loss, self.back_mse_loss, self.recon_loss, self.train_op],
                feed_dict=feed_dict)
            mse_losses.append(mse_loss)
            back_mse_losses.append(back_mse_loss)
            recon_losses.append(recon_loss)

        """ ------- Validation ------- """
        if n_valid_split > 0:
            bootstrap_valid_obs = valid_obs[valid_bootstrap_idx]
            bootstrap_valid_act = valid_act[valid_bootstrap_idx]
            bootstrap_valid_delta = valid_delta[valid_bootstrap_idx]
            bootstrap_valid_obs_next = valid_obs_next[valid_bootstrap_idx]
            bootstrap_valid_back_delta = valid_back_delta[valid_bootstrap_idx]
            bootstrap_valid_cp_obs = valid_cp_obs[valid_bootstrap_idx]
            bootstrap_valid_cp_act = valid_cp_act[valid_bootstrap_idx]

            feed_dict = self.get_feed_dict(bootstrap_valid_obs,
                                           bootstrap_valid_act,
                                           bootstrap_valid_delta,
                                           bootstrap_valid_obs_next,
                                           bootstrap_valid_back_delta,
                                           bootstrap_valid_cp_obs,
                                           bootstrap_valid_cp_act)
            v_mse_loss, v_back_mse_loss, v_recon_loss = sess.run(
                [self.mse_loss, self.back_mse_loss, self.recon_loss],
                feed_dict=feed_dict)

            if verbose:
                logger.log(
                    "Training DynamicsModel - finished epoch %i -- "
                    "[Training] mse loss: %.4f back mse loss: %.4f recon loss: %.4f "
                    "[Validation] mse loss: %.4f back mse loss: %.4f recon loss: %.4f epoch time: %.2f"
                    % (epoch, np.mean(mse_losses), np.mean(back_mse_losses),
                       np.mean(recon_losses), v_mse_loss, v_back_mse_loss,
                       v_recon_loss, time.time() - t0))

            # Early stopping on the rolling average of the validation loss.
            if valid_loss_rolling_average is None:
                # Initialize the rolling average above the first observed loss
                # so training cannot stop after the very first epoch.
                valid_loss_rolling_average = 1.5 * v_recon_loss
                valid_loss_rolling_average_prev = 2 * v_recon_loss
                if v_recon_loss < 0:
                    valid_loss_rolling_average = v_recon_loss / 1.5
                    valid_loss_rolling_average_prev = v_recon_loss / 2

            valid_loss_rolling_average = rolling_average_persitency * valid_loss_rolling_average \
                + (1.0 - rolling_average_persitency) * v_recon_loss

            if valid_loss_rolling_average_prev < valid_loss_rolling_average:
                logger.log('Stopping training of the model since its '
                           'valid_loss_rolling_average stopped decreasing')
                break
        else:
            if verbose:
                logger.log(
                    "Training DynamicsModel - finished epoch %i -- "
                    "[Training] mse loss: %.4f back mse loss: %.4f recon loss: %.4f epoch time: %.2f"
                    % (epoch, np.mean(mse_losses), np.mean(back_mse_losses),
                       np.mean(recon_losses), time.time() - t0))

        epoch_times.append(time.time() - t0)  # record duration for AvgModelEpochTime
        valid_loss_rolling_average_prev = valid_loss_rolling_average

    """ ------- Tabular Logging ------- """
    if log_tabular:
        logger.logkv('AvgModelEpochTime', np.mean(epoch_times))
        logger.logkv('Epochs', epoch)
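# -----------------------------------------------------------------------------
# The early-stopping rule in `fit` tracks an exponential moving average of the
# validation reconstruction loss and stops once that average stops decreasing.
# Below is a minimal standalone sketch of the same rule on synthetic losses;
# `_demo_early_stopping` and its inputs are illustrative only, not part of
# this codebase.
# -----------------------------------------------------------------------------
import numpy as np


def _demo_early_stopping(valid_losses, persistency=0.9):
    """Return the epoch at which the rolling-average criterion would stop."""
    rolling, rolling_prev = None, None
    for epoch, loss in enumerate(valid_losses):
        if rolling is None:
            # initialize above the first loss so epoch 0 never triggers a stop
            rolling, rolling_prev = 1.5 * loss, 2.0 * loss
            if loss < 0:
                rolling, rolling_prev = loss / 1.5, loss / 2.0
        rolling = persistency * rolling + (1.0 - persistency) * loss
        if rolling_prev < rolling:  # rolling average stopped decreasing
            return epoch
        rolling_prev = rolling
    return len(valid_losses) - 1


# losses improve for 50 epochs, then degrade; the criterion fires once the
# lagging average turns back up
print(_demo_early_stopping(np.concatenate([np.linspace(1.0, 0.1, 50),
                                           np.linspace(0.1, 0.5, 50)])))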
def learn(*,
          policy,
          dynamics_model,
          env,
          nsteps,
          total_timesteps,
          ent_coef,
          lr,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          history_length=10,
          state_diff=1,
          load_path='',
          n_layers=2,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          save_interval=0,
          n_parallel=1,
          num_rollouts=1,
          max_path_length=200,
          seed=0,
          hidden_size=512,
          test_range=[],
          num_test=4,
          total_test=20,
          test_interval=0,
          env_flag='pendulum',
          normalize_flag=0,
          no_test_flag=False,
          only_test_flag=False,
          cp_dim_output=10):
    # Open one result file per test environment plus aggregate test/train logs.
    f_test_list = []
    for i in range(0, num_test):
        file_name = '%s/test_c%d.txt' % (logger.get_dir(), i)
        f_test = open(file_name, 'w+')
        f_test_list.append(f_test)
    file_name = '%s/test_tot.txt' % (logger.get_dir())
    f_test_tot = open(file_name, 'w+')
    file_name = '%s/train.txt' % (logger.get_dir())
    f_train = open(file_name, 'w+')

    # lr and cliprange may be floats (constant) or schedules called with frac.
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    if n_parallel > 1:
        vec_env = ParallelEnvExecutor(env, n_parallel, num_rollouts, max_path_length)
    else:
        vec_env = IterativeEnvExecutor(env, num_rollouts, max_path_length)

    nenvs = vec_env.num_envs
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    obs_dim = env.observation_space.shape[0]
    proc_obs_dim = env.proc_observation_space_dims
    if len(env.action_space.shape) == 0:
        act_dim = env.action_space.n
        discrete = True
    else:
        act_dim = env.action_space.shape[0]
        discrete = False

    make_model = lambda: Model(policy=policy,
                               proc_obs_dim=proc_obs_dim,
                               ac_space=ac_space,
                               nbatch_act=nenvs,
                               nbatch_train=nbatch_train,
                               nsteps=nsteps,
                               ent_coef=ent_coef,
                               vf_coef=vf_coef,
                               max_grad_norm=max_grad_norm,
                               hidden_size=hidden_size,
                               cp_dim_output=cp_dim_output,
                               n_layers=n_layers)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env=env,
                    dynamics_model=dynamics_model,
                    vec_env=vec_env,
                    model=model,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    env_flag=env_flag,
                    normalize_flag=normalize_flag,
                    history_length=history_length,
                    state_diff=state_diff)

    if load_path:
        dynamics_model.load(load_path)
        logger.log("Successfully loaded parameters from {}".format(load_path))
    else:
        logger.log("No load_path given; dynamics model parameters were not loaded")

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch

    # Build the train environment and one vectorized executor per test range.
    test_env_list = []
    if env_flag == 'cartpole':
        env_cls = RandomCartPole_Force_Length
    elif env_flag == 'pendulum':
        env_cls = RandomPendulumAll
    elif env_flag == 'halfcheetah':
        env_cls = HalfCheetahEnv
    elif env_flag == 'cripple_halfcheetah':
        env_cls = CrippleHalfCheetahEnv
    elif env_flag == 'ant':
        env_cls = AntEnv
    elif env_flag == 'slim_humanoid':
        env_cls = SlimHumanoidEnv
    else:
        raise ValueError(env_flag)

    train_env = env_cls()
    train_env.seed(0)
    train_env = normalize(train_env)

    for i in range(0, num_test):
        test_env = env_cls(test_range[i][0], test_range[i][1])
        test_env.seed(0)
        test_env = normalize(test_env)
        vec_test_env = ParallelEnvExecutor(test_env, n_parallel, 10, max_path_length)
        test_env_list.append(vec_test_env)

    if n_parallel > 1:
        vec_train_env = ParallelEnvExecutor(train_env, n_parallel, 10, max_path_length)
    else:
        vec_train_env = IterativeEnvExecutor(train_env, 10, max_path_length)

    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, contexts, states, epinfos = runner.run()  # pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, contexts, returns, masks,
                                                      actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, contexts, returns, masks,
                                                          actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('epminrew', safemin([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('epmaxrew', safemax([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            runner.save(savepath)

        if not no_test_flag:
            # TEST
            if test_interval and update % test_interval == 0 and logger.get_dir():
                train_reward = context_pred_rollout_multi(vec_env=vec_train_env,
                                                          env=train_env,
                                                          obs_dim=obs_dim,
                                                          act_dim=act_dim,
                                                          discrete=discrete,
                                                          model=model,
                                                          history_length=history_length,
                                                          state_diff=state_diff,
                                                          test_total=total_test,
                                                          runner=runner)
                print("train reward: " + str(train_reward))
                f_train.write("{}\n".format(train_reward))
                f_train.flush()
                os.fsync(f_train.fileno())

                total_test_reward = 0.0
                for i in range(0, num_test):
                    test_reward = context_pred_rollout_multi(
                        vec_env=test_env_list[i],
                        env=test_env,  # note: the last test env built above
                        obs_dim=obs_dim,
                        act_dim=act_dim,
                        discrete=discrete,
                        model=model,
                        history_length=history_length,
                        state_diff=state_diff,
                        test_total=total_test,
                        runner=runner)
                    print("test c" + str(i) + " reward: " + str(test_reward))
                    f_test_list[i].write("{}\n".format(test_reward))
                    f_test_list[i].flush()
                    os.fsync(f_test_list[i].fileno())
                    total_test_reward += test_reward
                f_test_tot.write("{}\n".format(total_test_reward))
                f_test_tot.flush()
                os.fsync(f_test_tot.fileno())

    for i in range(0, num_test):
        f_test_list[i].close()
    f_test_tot.close()
    f_train.close()
    logger.log("Training finished")
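# -----------------------------------------------------------------------------
# In `learn`, `lr` and `cliprange` are either floats (wrapped into constant
# schedules by `constfn`) or callables evaluated at
# frac = 1 - (update - 1) / nupdates, which decays from 1.0 toward 0 over
# training. Below is a minimal linear schedule compatible with that contract;
# the `linear_schedule` helper is illustrative, not part of this codebase.
# -----------------------------------------------------------------------------
def linear_schedule(initial_value):
    """Return a schedule f(frac) that anneals linearly from initial_value to ~0."""
    def f(frac):
        # frac is 1.0 on the first update and approaches 0 on the last one
        return initial_value * frac
    return f


# usage sketch: learn(..., lr=linear_schedule(3e-4), cliprange=linear_schedule(0.2))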
def _log_path_stats(self, paths, log=False, log_prefix='', writer=None, itr=None):
    # compute log stats
    average_discounted_return = np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]

    if log == 'reward':
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        if writer is not None:
            writer.add_scalar("log/AverageReturn",
                              np.mean(undiscounted_returns), itr)
    elif log == 'all' or log is True:
        logger.logkv(log_prefix + 'AverageDiscountedReturn', average_discounted_return)
        logger.logkv(log_prefix + 'AverageReturn', np.mean(undiscounted_returns))
        logger.logkv(log_prefix + 'NumTrajs', len(paths))
        logger.logkv(log_prefix + 'StdReturn', np.std(undiscounted_returns))
        logger.logkv(log_prefix + 'MaxReturn', np.max(undiscounted_returns))
        logger.logkv(log_prefix + 'MinReturn', np.min(undiscounted_returns))
        if writer is not None:
            writer.add_scalar("log/AverageDiscountedReturn", average_discounted_return, itr)
            writer.add_scalar("log/AverageReturn", np.mean(undiscounted_returns), itr)
            writer.add_scalar("log/NumTrajs", len(paths), itr)
            writer.add_scalar("log/StdReturn", np.std(undiscounted_returns), itr)
            writer.add_scalar("log/MaxReturn", np.max(undiscounted_returns), itr)
            writer.add_scalar("log/MinReturn", np.min(undiscounted_returns), itr)
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect trajectories from the (single) environment until total_samples
    timesteps have been gathered.

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): a list of path dicts, each holding the stacked transitions
            of one trajectory of up to max_path_length steps
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    running_paths = _get_empty_running_paths_dict()

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy

    # initial reset of the environment
    obs = np.asarray(self.env.reset())
    ts = 0

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if random:
            action = self.env.action_space.sample()
            agent_info = {}
        else:
            action, agent_info = policy.get_action(obs)
            if action.ndim == 2:
                action = action[0]
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obs, reward, done, env_info = self.env.step(action)
        ts += 1
        done = done or ts >= self.max_path_length
        if done:
            next_obs = self.env.reset()
            ts = 0
        env_time += time.time() - t

        new_samples = 0

        # append new samples to running paths
        if isinstance(reward, np.ndarray):
            reward = reward[0]
        running_paths["observations"].append(obs)
        running_paths["actions"].append(action)
        running_paths["rewards"].append(reward)
        running_paths["dones"].append(done)
        running_paths["env_infos"].append(env_info)
        running_paths["agent_infos"].append(agent_info)

        # if running path is done, add it to paths and empty the running path
        if done:
            paths.append(dict(
                observations=np.asarray(running_paths["observations"]),
                actions=np.asarray(running_paths["actions"]),
                rewards=np.asarray(running_paths["rewards"]),
                dones=np.asarray(running_paths["dones"]),
                env_infos=utils.stack_tensor_dict_list(running_paths["env_infos"]),
                agent_infos=utils.stack_tensor_dict_list(running_paths["agent_infos"]),
            ))
            new_samples += len(running_paths["rewards"])
            running_paths = _get_empty_running_paths_dict()

        pbar.update(new_samples)
        n_samples += new_samples
        obs = next_obs
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
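# -----------------------------------------------------------------------------
# Each path returned by `obtain_samples` is a plain dict of stacked arrays
# ("observations", "actions", "rewards", "dones", plus stacked info dicts).
# Below is a small illustrative consumer computing the return statistics that
# `_log_path_stats` reports; note that the "returns" field read there is
# normally filled in by the sample processor, so this sketch recomputes
# discounted returns directly from rewards. Not part of this codebase.
# -----------------------------------------------------------------------------
import numpy as np


def _returns_from_paths(paths, gamma=0.99):
    undiscounted = [float(np.sum(p["rewards"])) for p in paths]
    discounted = [float(np.sum(p["rewards"] * gamma ** np.arange(len(p["rewards"]))))
                  for p in paths]
    return undiscounted, discounted


fake_paths = [dict(rewards=np.ones(5)), dict(rewards=np.arange(3.0))]
print(_returns_from_paths(fake_paths))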
def train(self):
    """
    Collects data and trains the dynamics model
    """
    # Open one result file per test environment plus aggregate test/train logs.
    f_test_list = []
    for i in range(0, self.num_test):
        file_name = '%s/test_c%d.txt' % (logger.get_dir(), i)
        f_test = open(file_name, 'w+')
        f_test_list.append(f_test)
    file_name = '%s/test_tot.txt' % (logger.get_dir())
    f_test_tot = open(file_name, 'w+')
    file_name = '%s/train.txt' % (logger.get_dir())
    f_train = open(file_name, 'w+')

    itr_times = []
    t0 = time.time()

    test_env_list = []
    if self.env_flag == 'cartpole':
        env_cls = RandomCartPole_Force_Length
    elif self.env_flag == 'pendulum':
        env_cls = RandomPendulumAll
    elif self.env_flag == 'halfcheetah':
        env_cls = HalfCheetahEnv
    elif self.env_flag == 'cripple_halfcheetah':
        env_cls = CrippleHalfCheetahEnv
    elif self.env_flag == 'ant':
        env_cls = AntEnv
    elif self.env_flag == 'slim_humanoid':
        env_cls = SlimHumanoidEnv
    else:
        raise ValueError(self.env_flag)

    train_env = env_cls()
    train_env.seed(0)
    train_env = normalize(train_env)

    for i in range(0, self.num_test):
        test_env = env_cls(self.test_range[i][0], self.test_range[i][1])
        test_env.seed(0)
        test_env = normalize(test_env)
        vec_test_env = ParallelEnvExecutor(test_env, self.test_n_parallel,
                                           self.test_num_rollouts,
                                           self.test_max_epochs)
        test_env_list.append(vec_test_env)

    if len(train_env.action_space.shape) == 0:
        act_dim = train_env.action_space.n
        discrete = True
    else:
        act_dim = train_env.action_space.shape[0]
        discrete = False

    with self.sess.as_default() as sess:
        sess.run(tf.compat.v1.initializers.global_variables())
        start_time = time.time()
        for itr in range(self.start_itr, self.n_itr):
            if not self.only_test:
                itr_start_time = time.time()
                logger.log("\n ---------------- Iteration %d ----------------" % itr)

                time_env_sampling_start = time.time()
                if self.initial_random_samples and itr == 0:
                    logger.log("Obtaining random samples from the environment...")
                    env_paths = self.sampler.obtain_samples(log=True,
                                                            random=True,
                                                            log_prefix='')
                else:
                    logger.log("Obtaining samples from the environment using the policy...")
                    env_paths = self.sampler.obtain_samples(log=True, log_prefix='')
                logger.record_tabular('Time-EnvSampling',
                                      time.time() - time_env_sampling_start)

                ''' -------------- Process the samples ---------------- '''
                logger.log("Processing environment samples...")
                time_env_samp_proc = time.time()
                samples_data = self.sample_processor.process_samples(env_paths,
                                                                     log=True,
                                                                     itr=itr)
                logger.record_tabular('Time-EnvSampleProc',
                                      time.time() - time_env_samp_proc)

                ''' --------------- Fit the dynamics model --------------- '''
                time_fit_start = time.time()
                logger.log("Training dynamics model for %i epochs ..."
                           % self.dynamics_model_max_epochs)
                if self.context:
                    self.dynamics_model.fit(samples_data['concat_obs'],
                                            samples_data['concat_act'],
                                            samples_data['concat_next_obs'],
                                            samples_data['cp_observations'],
                                            samples_data['cp_actions'],
                                            samples_data['concat_bool'],
                                            epochs=self.dynamics_model_max_epochs,
                                            verbose=True,
                                            log_tabular=True)
                else:
                    self.dynamics_model.fit(samples_data['observations'],
                                            samples_data['actions'],
                                            samples_data['next_observations'],
                                            epochs=self.dynamics_model_max_epochs,
                                            verbose=True,
                                            log_tabular=True)
                logger.record_tabular('Time-ModelFit', time.time() - time_fit_start)

                """ ------------------- Logging -------------------------- """
                logger.logkv('Itr', itr)
                logger.logkv('n_timesteps', self.sampler.total_timesteps_sampled)
                logger.logkv('Time', time.time() - start_time)
                logger.logkv('ItrTime', time.time() - itr_start_time)

                logger.log("Saving snapshot...")
                params = self.get_itr_snapshot(itr)
                self.log_diagnostics(env_paths, '')
                logger.save_itr_params(itr, params)
                print(logger.get_dir())
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, 'params_epoch_{}'.format(itr))
                self.dynamics_model.save(savepath)
                logger.log("Saved")

                logger.dumpkvs()
            else:
                logger.log("Test - {}/{} iterations".format(itr + 1, self.n_itr))
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                loadpath = osp.join(checkdir, 'params_epoch_{}'.format(itr))
                self.dynamics_model.load(loadpath)
                logger.log("Successfully loaded parameters from {}".format(loadpath))
                if itr != 0:
                    itr_times.append(time.time() - t0)
                    avg_itr_time = np.mean(itr_times)
                    eta = avg_itr_time * (self.n_itr - itr) / 60.
                    logger.log("Test - {}/{} iterations | ETA: {:.2f} mins".format(
                        itr + 1, self.n_itr, eta))
                t0 = time.time()

            if self.no_test:
                print('no test')
            else:
                if itr % 1 == 0 or itr == self.n_itr - 1:  # i.e. test every iteration
                    if self.context:
                        rollout = context_rollout_multi
                    else:
                        rollout = rollout_multi

                    total_test_reward = 0.0
                    for i in range(0, self.num_test):
                        test_reward = rollout(vec_env=test_env_list[i],
                                              policy=self.policy,
                                              discrete=discrete,
                                              num_rollouts=self.test_num_rollouts,
                                              test_total=self.total_test,
                                              act_dim=act_dim,
                                              use_cem=self.use_cem,
                                              horizon=self.horizon,
                                              context=self.context,
                                              history_length=self.history_length,
                                              state_diff=self.state_diff)
                        print("test c" + str(i) + " reward: " + str(test_reward))
                        f_test_list[i].write("{}\n".format(test_reward))
                        f_test_list[i].flush()
                        os.fsync(f_test_list[i].fileno())
                        self.writer.add_scalar("test/c{}".format(i), test_reward, itr)
                        total_test_reward += test_reward / self.num_test
                    f_test_tot.write("{}\n".format(total_test_reward))
                    f_test_tot.flush()
                    os.fsync(f_test_tot.fileno())
                    self.writer.add_scalar("test/total_test", total_test_reward, itr)

            if itr == 1:
                # Freeze the graph after the first full iteration to catch
                # accidental op creation inside the training loop.
                sess.graph.finalize()

    for i in range(0, self.num_test):
        f_test_list[i].close()
    f_test_tot.close()
    f_train.close()
    logger.log("Training finished")
    self.sess.close()
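# -----------------------------------------------------------------------------
# `train` calls `sess.graph.finalize()` after the first iteration so that any
# op accidentally created inside the loop raises immediately instead of
# silently growing the graph every iteration. A standalone illustration,
# assuming TF1-style graph construction through tf.compat.v1 as used above.
# -----------------------------------------------------------------------------
import tensorflow as tf

g = tf.Graph()
with g.as_default():
    tf.compat.v1.placeholder(tf.float32, [None, 2])  # allowed: graph not final
g.finalize()
try:
    with g.as_default():
        tf.compat.v1.placeholder(tf.float32, [None, 2])  # would grow the graph
except RuntimeError as err:
    print("graph growth caught:", err)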
def obtain_samples(self, log=False, log_prefix='', random=False):
    """
    Collect batch_size trajectories from each task

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str): prefix for logger
        random (boolean): whether the actions are random

    Returns:
        (list): A list of dicts with the samples
    """
    # initial setup / preparation
    paths = []
    n_samples = 0
    num_envs = self.vec_env.num_envs
    running_paths = [_get_empty_running_paths_dict() for _ in range(num_envs)]

    pbar = ProgBar(self.total_samples)
    policy_time, env_time = 0, 0

    policy = self.policy
    if self.use_cem:
        for i in range(num_envs):
            self.reset_cem(i)

    # initial reset of meta_envs
    obses = np.asarray(self.vec_env.reset())
    state_counts = [0] * self.vec_env.num_envs

    # Flat per-env history buffers holding the last `history_length`
    # observations (or observation diffs) and actions.
    self.obs_dim = obses.shape[1]
    history_state = np.zeros((obses.shape[0], self.obs_dim * self.history_length))
    history_act = np.zeros((obses.shape[0], self.act_dim * self.history_length))

    while n_samples < self.total_samples:
        # execute policy
        t = time.time()
        if random:
            actions = np.stack(
                [self.env.action_space.sample() for _ in range(num_envs)], axis=0)
            agent_infos = {}
        else:
            if self.use_cem:
                if self.context:
                    cem_solutions, agent_infos = policy.get_actions(
                        obses,
                        init_mean=self.prev_sol,
                        init_var=self.init_var,
                        cp_obs=history_state,
                        cp_act=history_act)
                else:
                    cem_solutions, agent_infos = policy.get_actions(
                        obses, init_mean=self.prev_sol, init_var=self.init_var)
                # Warm-start the next CEM solve with the tail of the current plan.
                self.prev_sol[:, :-1] = cem_solutions[:, 1:].copy()
                self.prev_sol[:, -1:] = 0.
                actions = cem_solutions[:, 0].copy()
            else:
                if self.context:
                    actions, agent_infos = policy.get_actions(
                        obses, cp_obs=history_state, cp_act=history_act)
                else:
                    actions, agent_infos = policy.get_actions(obses)
            if len(self.env.action_space.shape) == 0:
                actions = actions.reshape(-1)
        policy_time += time.time() - t

        # step environments
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t

        # stack agent_infos and if no infos were provided (--> None) create empty dicts
        agent_infos, env_infos = self._handle_info_dicts(agent_infos, env_infos)

        new_samples = 0
        for idx, observation, action, reward, env_info, agent_info, done in zip(
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if len(self.env.action_space.shape) == 0:
                action = np.eye(self.act_dim)[action]  # one-hot encode discrete actions
            else:
                if action.ndim == 0:
                    action = np.expand_dims(action, 0)
                assert action.ndim == 1, (action, action.shape)

            # append new samples to running paths
            if isinstance(reward, np.ndarray):
                reward = reward[0]
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["dones"].append(done)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            running_paths[idx]["cp_obs"].append(history_state[idx].copy())
            running_paths[idx]["cp_act"].append(history_act[idx].copy())

            # update the history buffer: fill slots left to right until full,
            # then shift left by one slot and write the newest entry at the end
            if state_counts[idx] < self.history_length:
                if self.state_diff:
                    history_state[idx][state_counts[idx] * self.obs_dim:
                                       (state_counts[idx] + 1) * self.obs_dim] = \
                        next_obses[idx] - observation
                else:
                    history_state[idx][state_counts[idx] * self.obs_dim:
                                       (state_counts[idx] + 1) * self.obs_dim] = observation
                history_act[idx][state_counts[idx] * self.act_dim:
                                 (state_counts[idx] + 1) * self.act_dim] = action
            else:
                history_state[idx][:-self.obs_dim] = history_state[idx][self.obs_dim:]
                if self.state_diff:
                    history_state[idx][-self.obs_dim:] = next_obses[idx] - observation
                else:
                    history_state[idx][-self.obs_dim:] = observation
                history_act[idx][:-self.act_dim] = history_act[idx][self.act_dim:]
                history_act[idx][-self.act_dim:] = action

            # if running path is done, add it to paths and empty the running path
            if done:
                paths.append(dict(
                    observations=np.asarray(running_paths[idx]["observations"]),
                    actions=np.asarray(running_paths[idx]["actions"]),
                    rewards=np.asarray(running_paths[idx]["rewards"]),
                    dones=np.asarray(running_paths[idx]["dones"]),
                    env_infos=utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
                    agent_infos=utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
                    cp_obs=np.asarray(running_paths[idx]["cp_obs"]),
                    cp_act=np.asarray(running_paths[idx]["cp_act"]),
                ))
                new_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = _get_empty_running_paths_dict()

                if not random and self.use_cem:
                    self.reset_cem(idx)

                state_counts[idx] = 0
                history_state[idx] = np.zeros((self.obs_dim * self.history_length))
                history_act[idx] = np.zeros((self.act_dim * self.history_length))
            else:
                state_counts[idx] += 1

        pbar.update(self.vec_env.num_envs)
        n_samples += new_samples
        obses = next_obses
    pbar.stop()

    self.total_timesteps_sampled += self.total_samples
    if log:
        logger.logkv(log_prefix + "PolicyExecTime", policy_time)
        logger.logkv(log_prefix + "EnvExecTime", env_time)

    return paths
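# -----------------------------------------------------------------------------
# The per-environment history buffers above keep the most recent
# `history_length` observations (or observation diffs) and actions as flat
# vectors: slots fill left to right until the buffer is full, after which the
# buffer shifts left by one slot and the newest entry is written at the end.
# Below is a standalone sketch of that update; `_update_history` is
# illustrative and not part of this codebase.
# -----------------------------------------------------------------------------
import numpy as np


def _update_history(buf, new_entry, count, dim, history_length):
    """Write `new_entry` (shape (dim,)) into the flat history buffer `buf`."""
    if count < history_length:
        buf[count * dim:(count + 1) * dim] = new_entry
    else:
        buf[:-dim] = buf[dim:]   # drop the oldest slot
        buf[-dim:] = new_entry   # append the newest entry
    return buf


buf = np.zeros(3 * 2)  # history_length=3, dim=2
for t in range(5):
    _update_history(buf, np.full(2, float(t)), t, dim=2, history_length=3)
print(buf)  # holds the entries for t = 2, 3, 4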