# Symbolic inputs. Asking the spaces for the variables lets the environment
# choose the tensor type; extra_dims=1 adds one leading axis so each variable
# represents a list of observations / actions.
observations_var = env.observation_space.new_tensor_variable(
    "observations", extra_dims=1)
actions_var = env.action_space.new_tensor_variable("actions", extra_dims=1)
d_rewards_var = TT.vector("d_rewards")
importance_weights_var = TT.vector("importance_weight")

# dist_info_sym returns a dictionary of symbolic expressions describing the
# action distribution (for a Gaussian policy: mean and log standard
# deviation). It is built for both the current and the snapshot policy on the
# same observations.
dist_info_vars = policy.dist_info_sym(observations_var)
snap_dist_info_vars = snap_policy.dist_info_sym(observations_var)

# Trainable parameters of the two policies.
params = policy.get_params(trainable=True)
snap_params = snap_policy.get_params(trainable=True)

# Surrogate loss: negated log-likelihood term weighted by d_rewards_var,
# summed over the trajectory.
surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars)
    * d_rewards_var
)

# Likelihood ratio between the snapshot policy and the current policy.
importance_weights = dist.likelihood_ratio_sym_1traj_GPOMDP(
    actions_var, snap_dist_info_vars, dist_info_vars)

grad = theano.grad(surr, params)

# Placeholders with the dtype of the corresponding gradient entry.
# NOTE: the Theano labels do not all match the Python names (eval_grad1 is
# labelled 'eval_grad0', eval_grad2 is labelled 'eval_grad1'); they are kept
# unchanged.
eval_grad1 = TT.matrix("eval_grad0", dtype=grad[0].dtype)
eval_grad2 = TT.vector("eval_grad1", dtype=grad[1].dtype)
eval_grad3 = TT.col("eval_grad3", dtype=grad[2].dtype)
eval_grad4 = TT.vector("eval_grad4", dtype=grad[3].dtype)
# Training hyper-parameters.
n_itr = 1000            # number of iterations
discount = 0.99         # discount factor for the problem
learning_rate = 5e-05   # learning rate for the gradient update

# Symbolic inputs. extra_dims=1 adds one leading axis so each variable
# represents a list of observations / actions.
observations_var = env.observation_space.new_tensor_variable(
    "observations", extra_dims=1)
actions_var = env.action_space.new_tensor_variable("actions", extra_dims=1)
d_rewards_var = TT.vector("d_rewards")

# dist_info_sym returns a dictionary of symbolic expressions describing the
# action distribution (for a Gaussian policy: mean and log standard
# deviation).
dist_info_vars = policy.dist_info_sym(observations_var)

# Trainable parameters of the policy.
params = policy.get_params(trainable=True)

# Surrogate loss: negated log-likelihood term weighted by d_rewards_var,
# summed over the trajectory.
surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars)
    * d_rewards_var
)
grad = theano.grad(surr, params)

# Placeholders with the dtype of the corresponding gradient entry.
# NOTE: the first two Theano labels are offset from the Python names
# (eval_grad1 -> 'eval_grad0', eval_grad2 -> 'eval_grad1'); they are kept
# unchanged.
eval_grad1 = TT.matrix("eval_grad0", dtype=grad[0].dtype)
eval_grad2 = TT.vector("eval_grad1", dtype=grad[1].dtype)
eval_grad3 = TT.col("eval_grad3", dtype=grad[2].dtype)
eval_grad4 = TT.vector("eval_grad4", dtype=grad[3].dtype)
eval_grad5 = TT.vector("eval_grad5", dtype=grad[4].dtype)
# type for the variable. For instance, for an environment with discrete observations, we might want to use integer
# types if the observations are represented as one-hot vectors.
observations_var = env.observation_space.new_tensor_variable(
    "observations",
    extra_dims=1,  # one extra axis: the variable holds a list of observations
)
actions_var = env.action_space.new_tensor_variable("actions", extra_dims=1)
advantages_var = TT.vector("advantages")

# dist_info_sym yields a dictionary of symbolic expressions describing the
# action distribution; for a Gaussian policy these are the mean and the (log)
# standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

# policy.distribution is a distribution object (rllab.distributions) that
# offers helpers operating on dist_info_vars; log_likelihood_sym below gives
# the symbolic log-likelihood. For this example it is an instance of
# rllab.distributions.DiagonalGaussian.
dist = policy.distribution

# Trainable parameters of the policy.
params = policy.get_params(trainable=True)

# The objective is negated because most optimizers minimize.
surr = -TT.mean(
    dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var
)
grads = theano.grad(surr, params)
def run_task(*_):
    """Train a Gaussian MLP policy on LunarLanderContinuous-v2 by policy gradient.

    Each epoch collects ``N`` rollouts of at most ``T`` steps, computes
    discounted returns and per-path normalized advantages against a linear
    feature baseline, takes one Adam step on the negated
    ``mean(log_likelihood * advantage)`` surrogate, refits the baseline and
    logs return statistics through ``logger``.

    Positional arguments are accepted and ignored.
    """
    # normalize() makes sure that the actions for the environment lie within
    # the range [-1, 1] (only works for environments with continuous actions).
    env = normalize(
        GymEnv(env_name="LunarLanderContinuous-v2", force_reset=True))
    # Neural network policy with two hidden layers of 64 units each.
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(64, 64))
    # Linear baseline estimator using default hand-crafted features.
    baseline = LinearFeatureBaseline(env.spec)

    # Number of trajectories collected per iteration.
    N = 3
    # Maximum number of time steps per trajectory.
    T = 400
    # Number of iterations.
    n_itr = 1000
    # Discount factor for the problem.
    discount = 0.99
    # Learning rate for the gradient update.
    learning_rate = 0.001

    # Construct the computation graph.
    # We could have simply written `observations_var = TT.matrix('observations')`
    # for this example. Delegating to the environment lets it pick the correct
    # data type, e.g. integer types for one-hot discrete observations.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # One extra dimension since we want to represent a list of observations.
        extra_dims=1)
    actions_var = env.action_space.new_tensor_variable('actions', extra_dims=1)
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary of symbolic expressions for
    # quantities related to the action distribution. For a Gaussian policy it
    # contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under
    # rllab.distributions with utilities operating on dist_info_vars.
    dist = policy.distribution

    # The objective is negated since most optimizers assume a minimization
    # problem.
    surr = -TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Trainable parameters and their gradients.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True)

    for epoch in range(n_itr):
        logger.push_prefix('epoch #%d | ' % epoch)
        logger.log("Training started")

        paths = []
        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair. The second entry holds
                # sufficient statistics of the action distribution (useful
                # e.g. for importance sampling ratios); it is not needed here.
                action, _ = policy.get_action(observation)
                # The last entry of the step tuple stores diagnostic
                # information about the environment; it is not needed here.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached.
                    break

            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)

            # Empirical discounted return for every time step, accumulated
            # backwards in time.
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantages.append(return_so_far - path_baseline[t])
            # Both lists were filled backwards in time, so revert them.
            advantages = np.array(advantages[::-1])
            returns = np.array(returns[::-1])

            # Normalize advantages within the path; the epsilon avoids a
            # division by zero when all advantages are equal.
            advantages = (advantages - np.mean(advantages)) / (
                np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        f_train(observations, actions, advantages)

        # Undiscounted return of every collected path.
        returns_to_check = [sum(p["rewards"]) for p in paths]
        print('Average Return:', np.mean(returns_to_check))

        logger.log("Training finished")
        logger.save_itr_params(epoch, params)

        # Record this epoch's statistics *before* dumping the table so that the
        # dump shows the current epoch (recording after the dump delays every
        # row by one epoch and drops the last one).
        logger.record_tabular('Epoch', epoch)
        # Upper bound on the steps taken so far: rollouts may end early.
        logger.record_tabular('Steps', epoch * N * T)
        logger.record_tabular('AverageReturn', np.mean(returns_to_check))
        logger.record_tabular('StdReturn', np.std(returns_to_check))
        logger.record_tabular('MaxReturn', np.max(returns_to_check))
        logger.record_tabular('MinReturn', np.min(returns_to_check))

        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()
def doit(mode):
    """Run a policy-gradient experiment on cartpole and return the average returns.

    Args:
        mode: Selects the baseline and advantage post-processing.
            A mode containing ``"linbaseline"`` uses a linear feature
            baseline; one containing ``"vanilla"`` uses a zero baseline;
            exactly ``"batchavg"`` uses a zero baseline and subtracts the
            batch mean from the returns. A mode containing ``"_ztrans"``
            additionally z-normalizes the advantages of every path.

    Returns:
        A list with the average undiscounted return of each iteration.

    Raises:
        ValueError: If ``mode`` selects none of the known baselines.
    """
    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy
    from rllab.envs.normalized_env import normalize
    import numpy as np
    import theano
    import theano.tensor as TT
    from lasagne.updates import adam

    # normalize() makes sure that the actions for the environment lie within
    # the range [-1, 1] (only works for environments with continuous actions).
    env = normalize(CartpoleEnv())
    # Neural network policy with a single hidden layer of 8 hidden units.
    policy = GaussianMLPPolicy(env.spec, hidden_sizes=(8,))

    # Pick the baseline estimator according to the mode.
    if "linbaseline" in mode:
        print('linear baseline')
        baseline = LinearFeatureBaseline(env.spec)
    elif "vanilla" in mode:
        print("zero baseline")
        baseline = ZeroBaseline(env.spec)
    elif mode == "batchavg":
        print('batch average baseline')
        # Zero baseline; the mean of the discounted returns is subtracted
        # after the rollouts (see below).
        baseline = ZeroBaseline(env.spec)
    else:
        # Fail early instead of hitting an unbound `baseline` after the graph
        # has been compiled and the first rollout collected.
        raise ValueError(
            "unknown mode %r: expected a mode containing 'linbaseline' or "
            "'vanilla', or exactly 'batchavg'" % (mode,))

    use_ztrans = "_ztrans" in mode
    if use_ztrans:
        print('z transform advantages')
    else:
        print('no z transform')

    # Number of trajectories collected per iteration.
    N = 50
    # Maximum number of time steps per trajectory.
    T = 50
    # Number of iterations.
    n_itr = 50
    # Discount factor for the problem.
    discount = 0.99
    # Learning rate for the gradient update.
    learning_rate = 0.1

    # Construct the computation graph.
    # We could have simply written `observations_var = TT.matrix('observations')`
    # for this example. Delegating to the environment lets it pick the correct
    # data type, e.g. integer types for one-hot discrete observations.
    observations_var = env.observation_space.new_tensor_variable(
        'observations',
        # One extra dimension since we want to represent a list of observations.
        extra_dims=1
    )
    actions_var = env.action_space.new_tensor_variable(
        'actions',
        extra_dims=1
    )
    advantages_var = TT.vector('advantages')

    # policy.dist_info_sym returns a dictionary of symbolic expressions for
    # quantities related to the action distribution. For a Gaussian policy it
    # contains the mean and (log) standard deviation.
    dist_info_vars = policy.dist_info_sym(observations_var)

    # policy.distribution returns a distribution object under
    # rllab.distributions with utilities operating on dist_info_vars.
    dist = policy.distribution

    # The objective is negated since most optimizers assume a minimization
    # problem.
    surr = - TT.mean(
        dist.log_likelihood_sym(actions_var, dist_info_vars) * advantages_var)

    # Trainable parameters and their gradients.
    params = policy.get_params(trainable=True)
    grads = theano.grad(surr, params)

    f_train = theano.function(
        inputs=[observations_var, actions_var, advantages_var],
        outputs=None,
        updates=adam(grads, params, learning_rate=learning_rate),
        allow_input_downcast=True
    )

    results = []
    for _ in range(n_itr):

        paths = []

        for _ in range(N):
            observations = []
            actions = []
            rewards = []

            observation = env.reset()

            for _ in range(T):
                # policy.get_action() returns a pair. The second entry holds
                # sufficient statistics of the action distribution (useful
                # e.g. for importance sampling ratios); it is not needed here.
                action, _ = policy.get_action(observation)
                # The last entry of the step tuple stores diagnostic
                # information about the environment; it is not needed here.
                next_observation, reward, terminal, _ = env.step(action)
                observations.append(observation)
                actions.append(action)
                rewards.append(reward)
                observation = next_observation
                if terminal:
                    # Finish rollout if terminal state reached.
                    break

            path = dict(
                observations=np.array(observations),
                actions=np.array(actions),
                rewards=np.array(rewards),
            )
            path_baseline = baseline.predict(path)

            # Empirical discounted return for every time step, accumulated
            # backwards in time.
            advantages = []
            returns = []
            return_so_far = 0
            for t in range(len(rewards) - 1, -1, -1):
                return_so_far = rewards[t] + discount * return_so_far
                returns.append(return_so_far)
                advantages.append(return_so_far - path_baseline[t])
            # Both lists were filled backwards in time, so revert them.
            advantages = np.array(advantages[::-1])
            returns = np.array(returns[::-1])

            if use_ztrans:
                # The epsilon avoids a division by zero when all advantages
                # of the path are equal.
                advantages = (advantages - np.mean(advantages)) / (
                    np.std(advantages) + 1e-8)

            path["advantages"] = advantages
            path["returns"] = returns

            paths.append(path)

        baseline.fit(paths)

        observations = np.concatenate([p["observations"] for p in paths])
        actions = np.concatenate([p["actions"] for p in paths])
        advantages = np.concatenate([p["advantages"] for p in paths])

        if mode == 'batchavg':
            # Up to here `advantages` are the plain discounted returns (zero
            # baseline, no z transformation); subtract their mean across all
            # episodes of the batch.
            advantages = advantages - np.mean(advantages)

        f_train(observations, actions, advantages)
        avgr = np.mean([sum(p["rewards"]) for p in paths])
        # Print label and value as two arguments, not as the repr of a tuple.
        print('Average Return:', avgr)
        results.append(avgr)
    return results
class Bw_Trans_Model:
    """Backward transition model made of two Gaussian MLP policies.

    ``bw_act_pol`` maps an observation to an action and ``bw_obs_pol`` maps a
    concatenated (observation, action) vector to a state difference. Both are
    trained with Adam on the negated summed log-likelihood of their targets.
    """

    def __init__(self, inputSize, outputSize, env, v, learning_rate,
                 batchsize, which_agent, x_index, y_index, num_fc_layers,
                 depth_fc_layers, print_minimal):
        """Build both networks and compile their Theano training functions.

        Args:
            inputSize: Stored on the instance; not otherwise used here.
            outputSize: Stored on the instance; not otherwise used here.
            env: Environment whose ``spec`` provides the observation and
                action spaces.
            v: Variant dictionary. Keys read: ``'bw_variance_learn'``,
                ``'bw_model_hidden_size'`` and ``'bw_learning_rate'``.
            learning_rate: Unused; the optimizers use
                ``v['bw_learning_rate']``.
            batchsize: Mini-batch size used by ``train``.
            which_agent: Stored on the instance.
            x_index: Stored on the instance.
            y_index: Stored on the instance.
            num_fc_layers: Unused.
            depth_fc_layers: Unused.
            print_minimal: When true, ``train`` suppresses progress output.
        """
        self.batchsize = batchsize
        self.which_agent = which_agent
        self.x_index = x_index
        self.y_index = y_index
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.print_minimal = print_minimal

        # Bounds of the Box spaces of the (observation, action) -> observation
        # model.
        LOW = -1000000
        HIGH = 1000000

        self.act_dim = env.spec.action_space.flat_dim
        self.obs_dim = env.spec.observation_space.flat_dim

        # observation -> action uses the environment's own spec.
        obs_to_act_spec = env.spec
        # (observation, action) -> observation-sized output: the "action"
        # space of this spec is what the network predicts.
        obsact_to_obs_spec = EnvSpec(
            observation_space=Box(LOW, HIGH,
                                  shape=(self.obs_dim + self.act_dim, )),
            action_space=Box(LOW, HIGH, shape=(self.obs_dim, )))

        # TODO: Think, whether to learn std for backwards policy or not.
        self.bw_act_pol = GaussianMLPPolicy(
            env_spec=obs_to_act_spec,
            hidden_sizes=(64, 64),
            learn_std=v['bw_variance_learn'],
        )

        self.bw_obs_pol = GaussianMLPPolicy(
            env_spec=obsact_to_obs_spec,
            hidden_sizes=(v['bw_model_hidden_size'],
                          v['bw_model_hidden_size']),
            learn_std=v['bw_variance_learn'],
            hidden_nonlinearity=NL.rectify,
        )

        # Symbolic inputs and targets of the two networks.
        self.obs_in = TT.matrix('obs_in')
        self.obsact_in = TT.matrix('obsact_in')
        self.act_out = TT.matrix('act_out')
        self.diff_out = TT.matrix('diff_out')

        bw_learning_rate = v['bw_learning_rate']
        self.bw_act_dist = self.bw_act_pol.dist_info_sym(self.obs_in)
        self.bw_obs_dist = self.bw_obs_pol.dist_info_sym(self.obsact_in)

        # Losses: negated summed log-likelihood of the targets.
        self.bw_act_loss = -TT.sum(
            self.bw_act_pol.distribution.log_likelihood_sym(
                self.act_out, self.bw_act_dist))
        bw_obs_loss = -TT.sum(
            self.bw_obs_pol.distribution.log_likelihood_sym(
                self.diff_out, self.bw_obs_dist))

        bw_act_params = self.bw_act_pol.get_params_internal()
        bw_obs_params = self.bw_obs_pol.get_params_internal()

        # Each network gets its own Adam update.
        bw_s_to_a_update = lasagne.updates.adam(
            self.bw_act_loss, bw_act_params, learning_rate=bw_learning_rate)
        bw_sa_to_s_update = lasagne.updates.adam(
            bw_obs_loss, bw_obs_params, learning_rate=bw_learning_rate)

        self.bw_act_train = theano.function([self.obs_in, self.act_out],
                                            self.bw_act_loss,
                                            updates=bw_s_to_a_update,
                                            allow_input_downcast=True)
        self.bw_obs_train = theano.function([self.obsact_in, self.diff_out],
                                            bw_obs_loss,
                                            updates=bw_sa_to_s_update,
                                            allow_input_downcast=True)

    def train(self, dataX, dataZ, dataX_new, dataZ_new, nEpoch, save_dir,
              fraction_use_new):
        """Fit both networks on mini-batches of the new data set.

        Args:
            dataX: Old inputs; only its row count is used (for reporting).
            dataZ: Old targets; unused.
            dataX_new: New inputs, one row per sample. The first
                ``self.obs_dim`` columns are fed to the action network as
                input and the remaining columns as its target; the full row
                is the input of the observation network.
            dataZ_new: New targets of the observation network.
            nEpoch: Number of passes over the new data.
            save_dir: Directory receiving ``training_losses.npy`` after every
                epoch.
            fraction_use_new: Fraction of each batch taken from the new data.

        Returns:
            The mean observation-network loss over the batches of the last
            epoch, or NaN when no batch was run (no epochs, or the batch
            would need old data, which is not implemented).
        """
        start = time.time()
        training_loss_list = []
        nData_old = dataX.shape[0]
        num_new_pts = dataX_new.shape[0]

        # How much of the new data to use per batch.
        if num_new_pts < (self.batchsize * fraction_use_new):
            batchsize_new_pts = num_new_pts  # use all of the new ones
        else:
            batchsize_new_pts = int(self.batchsize * fraction_use_new)

        # How much of the old data to use per batch.
        batchsize_old_pts = int(self.batchsize - batchsize_new_pts)

        # Number of full batches in the new data; an empty batch size means
        # there is nothing to iterate over.
        if batchsize_new_pts > 0:
            batches_per_epoch = num_new_pts // batchsize_new_pts
        else:
            batches_per_epoch = 0

        # Defined up front so the return value exists even without epochs.
        mean_loss = float('nan')

        for i in range(nEpoch):
            avg_loss = 0
            num_batches = 0

            if batchsize_old_pts > 0:
                # Mixing in old data is not implemented: no batch is trained.
                print("nothing is going on")
            else:
                # Train completely from the new set.
                for batch in range(batches_per_epoch):
                    # Walk through the shuffled new data.
                    lo = batch * batchsize_new_pts
                    hi = lo + batchsize_new_pts
                    dataX_batch = dataX_new[lo:hi, :]
                    dataZ_batch = dataZ_new[lo:hi, :]

                    data_x = dataX_batch[:, 0:self.obs_dim]
                    data_y = dataX_batch[:, self.obs_dim:]
                    loss = self.bw_act_train(data_x, data_y)
                    bw_obs_losses = self.bw_obs_train(dataX_batch,
                                                      dataZ_batch)

                    # The saved list tracks the action-network loss while the
                    # running average tracks the observation-network loss.
                    training_loss_list.append(loss)
                    avg_loss += bw_obs_losses
                    num_batches += 1

                # Shuffle the new data set after an epoch.
                p = npr.permutation(dataX_new.shape[0])
                dataX_new = dataX_new[p]
                dataZ_new = dataZ_new[p]

            # Avoid dividing by zero when the epoch ran no batch.
            if num_batches:
                mean_loss = avg_loss / num_batches
            else:
                mean_loss = float('nan')

            # Save losses after an epoch.
            np.save(save_dir + '/training_losses.npy', training_loss_list)
            if not self.print_minimal:
                if (i % 10) == 0:
                    print("\n=== Epoch {} ===".format(i))
                    print("loss: ", mean_loss)

        if not self.print_minimal:
            print("Training set size: ", (nData_old + dataX_new.shape[0]))
            print("Training duration: {:0.2f} s".format(time.time() - start))

        return mean_loss

    # Multistep prediction using the learned model at each step.
    def do_forward_sim(self, forwardsim_x_true, num_step, many_in_parallel,
                       env_inp, which_agent, mean_x, mean_y, mean_z, std_x,
                       std_y, std_z):
        """Roll the learned model out for ``num_step`` steps.

        Every step normalizes the current state with ``mean_x``/``std_x``,
        queries ``bw_act_pol`` for an action, queries ``bw_obs_pol`` with the
        normalized state and the (still normalized) action for a state
        difference, de-normalizes that difference with ``mean_z``/``std_z``
        and adds it to the current state.

        Args:
            forwardsim_x_true: Start state (copied, not modified).
            num_step: Number of model steps to take.
            many_in_parallel: When true nothing is simulated and two empty
                lists are returned (not implemented).
            env_inp: Unused.
            which_agent: Unused.
            mean_x, std_x: Normalization statistics of the states.
            mean_y, std_y: Normalization statistics of the actions.
            mean_z, std_z: Normalization statistics of the state differences.

        Returns:
            A pair ``(state_list, action_list)``: the state reached after each
            step and the de-normalized action chosen at each step.
        """
        state_list = []
        action_list = []

        if many_in_parallel:
            print("Future work..")
            return state_list, action_list

        curr_state = np.copy(forwardsim_x_true)
        for _ in range(num_step):
            # nan_to_num replaces the non-finite values produced where std_x
            # is zero.
            curr_state_preprocessed = np.nan_to_num(
                (curr_state - mean_x) / std_x)

            action = self.bw_act_pol.get_action(curr_state_preprocessed)[0]
            # De-normalized copy of the action for the caller; the model
            # itself is fed the normalized action.
            action_ = action * std_y + mean_y

            state_difference = self.bw_obs_pol.get_action(
                np.concatenate((curr_state_preprocessed, action)))[0]
            state_differences = (state_difference * std_z) + mean_z

            curr_state = np.copy(curr_state + state_differences)
            state_list.append(np.copy(curr_state))
            action_list.append(np.copy(action_))

        return state_list, action_list
def run_task(v):
    """Alternate TRPO training with imitation of backward-model rollouts.

    Every outer iteration trains the forward policy with TRPO, reverses the
    collected trajectories, fits ``Bw_Trans_Model`` on the highest-reward
    trajectories, rolls the backward model out from the highest-return states
    and takes one maximum-likelihood (imitation) step of the forward policy on
    each imagined trace.

    Args:
        v: Variant dictionary. Keys read here: ``which_agent``,
            ``fw_learning_rate``, ``yaml_file``, ``print_minimal``,
            ``exp_name``, ``outer_iters``, ``batch_size``,
            ``steps_per_rollout``, ``num_trpo_iters``,
            ``ConjugateGradientOptimizer``, ``FiniteDifferenceHvp``,
            ``top_k_trajectories``, ``top_k_trajectories_state_selection``,
            ``use_good_trajectories``, ``top_k_bw_samples``, ``nEpoch``,
            ``running_baseline``, ``fw_iter`` and ``num_imagination_steps``.
            The dictionary is also handed to ``Bw_Trans_Model``.
    """
    env, _ = create_env(v["which_agent"])
    fw_learning_rate = v['fw_learning_rate']  # 0.0005!

    yaml_path = os.path.abspath('yaml_files/' + v['yaml_file'] + '.yaml')
    assert (os.path.exists(yaml_path))
    with open(yaml_path, 'r') as f:
        # safe_load: plain yaml.load(f) without a Loader can construct
        # arbitrary Python objects and is rejected by recent PyYAML versions.
        params = yaml.safe_load(f)
    num_fc_layers = params['dyn_model']['num_fc_layers']
    depth_fc_layers = params['dyn_model']['depth_fc_layers']
    batchsize = params['dyn_model']['batchsize']
    lr = params['dyn_model']['lr']
    print_minimal = v['print_minimal']
    save_dir = os.path.join(args.save_dir, v['exp_name'])
    inputSize = (env.spec.action_space.flat_dim +
                 env.spec.observation_space.flat_dim)
    outputSize = env.spec.observation_space.flat_dim

    # Initialize the forward policy.
    policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(64, 64))
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    # Update function for the forward policy (imitation learning loss):
    # negated summed log-likelihood of the given actions.
    fwd_obs = TT.matrix('fwd_obs')
    fwd_act_out = TT.matrix('act_out')
    policy_dist = policy.dist_info_sym(fwd_obs)
    fw_loss = -TT.sum(
        policy.distribution.log_likelihood_sym(fwd_act_out, policy_dist))
    fw_params = policy.get_params_internal()
    fw_update = lasagne.updates.adam(fw_loss,
                                     fw_params,
                                     learning_rate=fw_learning_rate)
    fw_func = theano.function([fwd_obs, fwd_act_out],
                              fw_loss,
                              updates=fw_update,
                              allow_input_downcast=True)

    log_dir = v['yaml_file']
    print('Logging Tensorboard to: %s' % log_dir)
    hist_logger = hist_logging(log_dir)

    optimizer_params = dict(base_eps=1e-5)

    # Create the output tree. exist_ok keeps this working when save_dir is
    # already there, e.g. from a previous or interrupted run.
    os.makedirs(save_dir, exist_ok=True)
    for sub_dir in ('losses', 'models', 'saved_forwardsim',
                    'saved_trajfollow', 'training_data'):
        os.makedirs(save_dir + '/' + sub_dir, exist_ok=True)

    x_index, y_index, z_index, yaw_index,\
    joint1_index, joint2_index, frontleg_index,\
    frontshin_index, frontfoot_index, xvel_index, orientation_index = get_indices(v['which_agent'])
    dyn_model = Bw_Trans_Model(inputSize, outputSize, env, v, lr, batchsize,
                               v['which_agent'], x_index, y_index,
                               num_fc_layers, depth_fc_layers, print_minimal)

    for outer_iter in range(1, v['outer_iters']):
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=v["batch_size"],
            max_path_length=v["steps_per_rollout"],
            n_itr=v["num_trpo_iters"],
            discount=0.995,
            optimizer=v["ConjugateGradientOptimizer"](
                hvp_approach=v["FiniteDifferenceHvp"](**optimizer_params)),
            step_size=0.05,
            plot_true=True)

        all_paths = algo.train()

        # Collect the trajectories; those leading to high-value states are
        # used to learn the backward model.
        observations_list = []
        actions_list = []
        rewards_list = []
        returns_list = []
        for indexing in all_paths:
            for paths in indexing:
                observations = []
                actions = []
                returns = []
                reward_for_rollout = 0
                # The backward model is trained on reversed trajectories, so
                # walk each path from its last step to its first.
                for index_ in reversed(range(len(paths['observations']))):
                    observations.append(paths['observations'][index_])
                    actions.append(paths['actions'][index_])
                    returns.append(paths['returns'][index_])
                    reward_for_rollout += paths['rewards'][index_]
                observations_list.append(observations)
                actions_list.append(actions)
                rewards_list.append(reward_for_rollout)
                returns_list.append(returns)

        hist_logger.log_scalar(save_dir,
                               np.sum(rewards_list) / len(rewards_list),
                               outer_iter * v["num_trpo_iters"])

        # Conjecture 1: train the backward model on the top percentile of
        # trajectories by cumulative reward.
        # NOTE(review): when the computed count is 0, `[-0:]` selects every
        # trajectory rather than none -- confirm this is intended.
        rewards_list_np = np.asarray(rewards_list)
        number_of_trajectories = int(
            np.floor(v['top_k_trajectories'] * len(rewards_list) / 100))
        trajectory_indices = rewards_list_np.argsort(
        )[-number_of_trajectories:][::-1]
        selected_observations_list = [
            observations_list[idx] for idx in trajectory_indices
        ]
        selected_actions_list = [
            actions_list[idx] for idx in trajectory_indices
        ]

        # Same selection with its own percentile, used to pick start states.
        # A single, consistently spelled list is filled and read here.
        number_of_trajectories = int(
            np.floor(v['top_k_trajectories_state_selection'] *
                     len(rewards_list) / 100))
        trajectory_indices = rewards_list_np.argsort(
        )[-number_of_trajectories:][::-1]
        selected_observations_list_for_state_selection = [
            observations_list[idx] for idx in trajectory_indices
        ]
        selected_returns_list = [
            returns_list[idx] for idx in trajectory_indices
        ]

        # Where to start the backward model: take the top percentile of
        # high-value states, i.e. a non-parametric buffer of such states.
        if v['use_good_trajectories'] == 1:
            returns_list = selected_returns_list
            observations_list = selected_observations_list_for_state_selection

        # NOTE(review): flattening assumes np.asarray can stack the per-path
        # lists -- confirm behaviour for paths of different lengths.
        flatten_ret_list = np.asarray(returns_list).flatten()
        flatten_obs_list = np.vstack(np.asarray(observations_list))
        number_of_bw_samples = int(
            np.floor(v['top_k_bw_samples'] * len(flatten_ret_list) / 100))
        samples_indices = flatten_ret_list.argsort(
        )[-number_of_bw_samples:][::-1]
        bw_samples = [flatten_obs_list[idx] for idx in samples_indices]

        # Not all parts of the state are actually used.
        states = from_observation_to_usablestate(selected_observations_list,
                                                 v["which_agent"], False)
        controls = selected_actions_list
        dataX, dataY = generate_training_data_inputs(states, controls)
        states = np.asarray(states)
        dataZ = generate_training_data_outputs(states, v['which_agent'])

        # Every component (e.g. x position) should become mean 0, std 1.
        dataX, mean_x, std_x = zero_mean_unit_std(dataX)
        dataY, mean_y, std_y = zero_mean_unit_std(dataY)
        dataZ, mean_z, std_z = zero_mean_unit_std(dataZ)

        # Concatenate state and action, to be used for training dynamics.
        inputs = np.concatenate((dataX, dataY), axis=1)
        outputs = np.copy(dataZ)
        assert inputs.shape[0] == outputs.shape[0]

        # The epoch count always comes from the variant.
        nEpoch = v['nEpoch']

        training_loss = dyn_model.train(inputs, outputs, inputs, outputs,
                                        nEpoch, save_dir, 1)
        print("Training Loss for Backwards model", training_loss)

        if v['running_baseline'] == False:
            for goal_ind in range(min(v['fw_iter'], len(bw_samples))):
                # Roll the backward model out from a selected high-value
                # state.
                forwardsim_x_true = bw_samples[goal_ind]
                state_list, action_list = dyn_model.do_forward_sim(
                    forwardsim_x_true, v['num_imagination_steps'], False,
                    env, v['which_agent'], mean_x, mean_y, mean_z, std_x,
                    std_y, std_z)

                # Incorporate the backward trace into the forward policy
                # through one imitation step.
                fw_func(np.vstack(state_list), np.vstack(action_list))
        else:
            print('running TRPO baseline')