def get_action(self, state): obs, obs_list, obs_next_list, act_list = [], [], [], [] [obs.append(state) for _ in range(self.num_simulated_paths)] for _ in range(self.horizon): obs_list.append(obs) # get random actions actions = [] [ actions.append(self.env.action_space.sample()) for _ in range(self.num_simulated_paths) ] act_list.append(actions) obs = self.dyn_model.predict(np.array(obs), np.array(actions)) obs_next_list.append(obs) trajectory_cost_list = trajectory_cost_fn(self.cost_fn, np.array(obs_list), np.array(act_list), np.array(obs_next_list)) j = np.argmin(trajectory_cost_list) return act_list[0][j]
def get_action(self, state): # Broadcast state for batch calculations batch_state = np.tile(state,(self.num_simulated_paths,1)) best_traj = None state_traj = batch_state[np.newaxis,:] for i in range(self.horizon): # Sample actions batch_action = self.action_low+np.random.uniform(size=(self.num_simulated_paths,self.ac_dim))*(self.action_high-self.action_low) # Simulate dynamics from NN model batch_state = self.dyn_model.predict(batch_state,batch_action) # Add next state batch to trajectory state_traj = np.vstack((state_traj,batch_state[np.newaxis,:])) # Add actions to sequence if i == 0: action_seq = batch_action[np.newaxis,:] else: action_seq = np.vstack((action_seq,batch_action[np.newaxis,:])) s_t_traj = state_traj[0:self.horizon,:,:] s_tp1_traj = state_traj[1:self.horizon+1,:,:] traj_cost = trajectory_cost_fn(self.cost_fn,s_t_traj,action_seq,s_tp1_traj) best_traj = np.argmin(traj_cost) # Get best action best_action = action_seq[0,best_traj,:].reshape((self.ac_dim)) return best_action """ Note: be careful to batch your simulations through the model for speed """
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ obs = state * np.ones( (self.num_simulated_paths, state.shape[0])) # (paths, obs_dim) observations = [] # [(paths, obs_dim), ...] actions = [] # [(paths, action_dim), ...] next_observations = [] # [(paths, obs_dim), ...] for i in range(self.horizon): # sample from action candidates (instead of calling env.action_space.sample() every iteration) random_idx = np.random.choice(self.action_candidates.shape[0], obs.shape[0], replace=False) action = self.action_candidates[random_idx] #action = np.array([self.env.action_space.sample() for i in range(self.num_simulated_paths)]) observations += [obs] actions += [action] obs = self.dyn_model.predict(obs, action) next_observations += [obs] costs = trajectory_cost_fn(self.cost_fn, observations, actions, next_observations) # (paths, ) return actions[0][np.argmin(costs)]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ action_paths = self.sample_random_actions() # get init observations and copy num_simulated_paths times states = np.tile(state, [self.num_simulated_paths, 1]) states_paths_all = [] states_paths_all.append(states) for i in range(self.horizon): states = self.dyn_model.predict(states, action_paths[i, :, :]) states_paths_all.append(states) # evaluate trajectories states_paths_all = np.asarray(states_paths_all) # batch cost function states_paths = states_paths_all[:-1, :, :] states_nxt_paths = states_paths_all[1:, :, :] costs = trajectory_cost_fn(self.cost_fn, states_paths, action_paths, states_nxt_paths) min_cost_path = np.argmin(costs) opt_cost = costs[min_cost_path] opt_action_path = action_paths[:, min_cost_path, :] opt_action = copy.copy(opt_action_path[0]) # print("MPC imagine min cost: ", opt_cost) return opt_action
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ obs_dim = self.env.observation_space.shape[0] a_dim = self.env.action_space.shape[0] actions = np.zeros((self.horizon, self.num_simulated_paths, a_dim)) states = np.zeros((self.horizon, self.num_simulated_paths, obs_dim)) prime_states = np.zeros( (self.horizon, self.num_simulated_paths, obs_dim)) states[0, :, :] = np.full((self.num_simulated_paths, obs_dim), state) for step in range(self.horizon): states_t = states[step, :, :] a_t = np.array([ self.env.action_space.sample() for _ in range(self.num_simulated_paths) ]) ps_t = self.dyn_model.predict(states_t, a_t) actions[step, :, :] = a_t prime_states[step, :, :] = ps_t if step < self.horizon - 1: states[step + 1, :, :] = ps_t costs = trajectory_cost_fn(self.cost_fn, states, actions, prime_states) best_traj_index = np.argmin(costs, axis=0) return actions[0][best_traj_index]
def get_action(self, state): """ Note: be careful to batch your simulations through the model for speed """ states = np.repeat(np.expand_dims(state, axis=0), self.num_simulated_paths, axis=0) def get_rand_actions(): actions = [] for i in range(self.num_simulated_paths): actions.append(self.env.action_space.sample()) actions = np.array(actions) return actions actions = get_rand_actions() states_traj = [] next_states_traj = [] actions_traj = [] for i in range(self.horizon): next_states = self.dyn_model.predict(states, actions) actions_traj.append(actions) states_traj.append(states) next_states_traj.append(next_states) states = next_states actions = get_rand_actions() states_traj = np.swapaxes(np.array(states_traj), 0, 1) actions_traj = np.swapaxes(np.array(actions_traj), 0, 1) next_states_traj = np.swapaxes(np.array(next_states_traj), 0, 1) cost_val = trajectory_cost_fn(self.cost_fn, states_traj, actions_traj, next_states_traj) #ipdb.set_trace() return actions_traj[0, np.argmin(cost_val), :]
def get_action(self, state): """ Note: be careful to batch your simulations through the model for speed """ state_batch, states, next_states, actions = [], [], [], [] #state batches have dimension (K, dim(state)) for _ in range(self.num_simulated_paths): state_batch.append(state) for _ in range(self.horizon): action = [] for _ in range(self.num_simulated_paths): action.append(self.env.action_space.sample()) actions.append(action) states.append(state_batch) #use batch for speed state_batch = self.dyn_model.predict(np.array(state_batch), np.array(action)) next_states.append(state_batch) costs = trajectory_cost_fn(self.cost_fn, np.array(states), np.array(actions), np.array(next_states)) j_star = np.argmin(np.array(costs)) return actions[0][j_star]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ action_dim = self.env.action_space.shape[0] # return np.zeros(self.env.action_space.shape) state_dim = self.env.observation_space.shape[0] path_actions = np.zeros( [self.horizon, self.num_simulated_paths, action_dim]) path_states = np.zeros( [self.horizon, self.num_simulated_paths, state_dim]) path_next_states = np.zeros( [self.horizon, self.num_simulated_paths, state_dim]) states = np.ones([self.num_simulated_paths, state_dim ]) * state.reshape([-1, state_dim]) for i in range(self.horizon): path_states[i] = state path_actions[i] = np.asarray([ self.env.action_space.sample() for _ in range(self.num_simulated_paths) ]) states = self.dyn_model.predict(states, path_actions[i]) path_next_states[i] = states path_costs = trajectory_cost_fn(self.cost_fn, path_states, path_actions, path_next_states) best = np.argmin(path_costs) return path_actions[0, best]
def get_action(self, state): # horizon * num_paths* state_dim or action_dim trajs = np.array([ self.env.action_space.sample() for p in range(self.horizon * self.num_simulated_paths) ]).reshape((self.horizon, self.num_simulated_paths, -1)) #print "Trajs dimension", trajs.shape #TODO: action_dim #n_paths*horizon*action_dim accum_states = np.zeros( (self.horizon, self.num_simulated_paths, state.shape[0])) accum_next_states = np.zeros( (self.horizon, self.num_simulated_paths, state.shape[0])) states = np.ones( (self.num_simulated_paths, 1)) * state # n_paths*state_dim #print "state", state #print "accum_states", accum_states #print "accum_states shape", accum_states.shape for time_idx in range(self.horizon): actions = trajs[time_idx, :, :] # n_paths * action_dim #print "action dimension", actions.shape next_states = self.dyn_model.predict(states, actions) #print "next states dimension", next_states.shape accum_states[time_idx, :, :] = states accum_next_states[time_idx, :, :] = next_states states = next_states.copy( ) #set the states to be next_states for next time_idx cost = trajectory_cost_fn(self.cost_fn, accum_states, trajs, accum_next_states) #print "cost shape", cost.shape #print "min cost", np.min(cost) #print "min cost arg way", cost[np.argmin(cost)] action = trajs[0, np.argmin(cost), :] # first action for min rollout return action.flatten()
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ current_states = np.zeros((self.horizon, self.num_simulated_paths, self.env.observation_space.shape[0])) actions = np.zeros((self.horizon, self.num_simulated_paths, self.env.action_space.shape[0])) next_states = np.zeros((self.horizon, self.num_simulated_paths, self.env.observation_space.shape[0])) current_states[0, :, :] = state for h in range(self.horizon): # change for np.random.uniform actions[h, :, :] = np.random.uniform( self.env.action_space.low, self.env.action_space.high, (self.num_simulated_paths, self.env.action_space.shape[0])) next_states[h, :, :] = self.dyn_model.predict( current_states[h, :, :], actions[h, :, :]) # change next current state with next states if h < self.horizon - 1: current_states[h + 1, :, :] = next_states[h, :, :] tc = trajectory_cost_fn(self.cost_fn, current_states, actions, next_states) min_cost = np.argmin(tc) return actions[0][min_cost]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ states = [] actions = [] nxt_states = [] state_batch = np.tile(state, [self.num_simulated_paths, 1]) # copy state for simulated numbers path and make them a fake batch # do the imagination rolling out for j in range(self.horizon): action_batch = [] for i in range(self.num_simulated_paths): # random policy action_batch.append(self.env.action_space.sample()) # choose random actions for next step of 10 simulated paths action_batch = np.asarray(action_batch) nxt_batch = self.dyn_model.predict(state_batch,action_batch) # rollout next batches states.append(state_batch) actions.append(action_batch) nxt_states.append(nxt_batch) state_batch = np.copy(nxt_batch) traj_s=np.asarray(states).transpose(1,0,2) traj_nxt_s = np.asarray(nxt_states).transpose(1,0,2) traj_action = np.asarray(actions).transpose(1,0,2) costs = [] for i in range(traj_s.shape[0]): trajectory_cost = trajectory_cost_fn(cheetah_cost_fn,traj_s[i], traj_action[i],traj_nxt_s[i]) costs.append(trajectory_cost) a_idx = np.argmin(np.array(costs)) return traj_action[a_idx][0]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ N = self.num_simulated_paths h = self.horizon action_samples = np.random.uniform(self.env.action_space.low, self.env.action_space.high, \ [h, N, *self.env.action_space.shape]) state_samples = [np.tile(state, (N, 1))] for step in range(h): current_state = state_samples[-1] current_action = action_samples[step] state_samples.append( self.dyn_model.predict(current_state, current_action)) next_states = np.swapaxes(np.array(state_samples[1:]), 1, 0) state_samples = np.swapaxes(np.array(state_samples[:-1]), 1, 0) action_samples = np.swapaxes(action_samples, 1, 0) costs = np.array([ trajectory_cost_fn(self.cost_fn, s, a, n) for s, a, n in zip(state_samples, action_samples, next_states) ]) best_index = costs.argmin() best_action = action_samples[best_index][0] best_cost = costs[best_index] return best_action, best_cost
def path_cost(cost_fn, path): costs = [] cum_len = 0 for i in path["ep_lengths"]: costs.append(trajectory_cost_fn(cost_fn, path['observations'][cum_len: cum_len+i], path['actions'][cum_len: cum_len+i], path['next_observations'][cum_len: cum_len+i])) cum_len = cum_len + i return costs
def get_action(self, state): """ YOUR CODE HERE """ # TODO: random select all actions before predict. # states = [] # actions = [] # next_states = [] # states.append(np.repeat([state], self.num_simulated_paths, axis=0)) # for i in range(self.horizon): # path_action = [] # for k in range(self.num_simulated_paths): # path_action.append(self.env.action_space.sample()) # actions.append(path_action) # for i in range(self.horizon): # batch_states = states[i] # batch_actions = actions[i] # batch_next_states = self.dyn_model.predict(batch_states, batch_actions) # states.append(batch_states) # next_states.append(batch_states) # traj_score = trajectory_cost_fn(self.cost_fn, np.array(states), np.array(actions), np.array(next_states)) # ind_ac = np.argmin(traj_score) # return actions[0][ind_ac] paths = {} for k in range(self.num_simulated_paths): paths[k] = {} paths[k]["states"] = [state] paths[k]["actions"] = [] paths[k]["next_states"] = [] for i in range(self.horizon): states = np.concatenate([[paths[k]["states"][i]] for k in range(len(paths))]) actions = [] for k in range(self.num_simulated_paths): act = self.env.action_space.sample() paths[k]["actions"].append(act) actions.append(act) actions = np.array(actions) next_states = self.dyn_model.predict(states, actions) for k in range(self.num_simulated_paths): paths[k]["states"].append(next_states[k]) paths[k]["next_states"].append(next_states[k]) traj_score = [] for k in range(self.num_simulated_paths): traj_score.append( trajectory_cost_fn(self.cost_fn, paths[k]["states"], paths[k]["actions"], paths[k]["next_states"])) traj_score = np.array(traj_score) ind_ac = np.argmin(traj_score) return paths[ind_ac]["actions"][0] """ Note: be careful to batch your simulations through the model for speed """
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ #print("num path" + repr(self.num_simulated_paths)) costs = np.zeros((self.num_simulated_paths)) sample_actions = np.random.uniform( self.env.action_space.low, self.env.action_space.high, (self.num_simulated_paths, self.horizon, self.env.action_space.shape[0])) #print("sample_actions" + repr(sample_actions.shape)) #print("state" + repr(state.shape)) states = np.zeros( (self.num_simulated_paths, self.horizon, state.shape[0])) resulting_states = np.zeros( (self.num_simulated_paths, self.horizon, state.shape[0])) # use dynamics model to associoated simulated rollouts (s_t+1 = f(s_t,a_t) for k in range(self.num_simulated_paths): # for timestep horizons actions = np.copy(sample_actions[k]) #print(k) #for h in range(self.horizon): #print("k sample actions " + repr(actions[h].shape)) #action = actions[h] # get next states curr_states, curr_resulting_states = self.dyn_model.predict( state, actions) #print("resulting_states " + repr(resulting_state.shape)) resulting_states[k] = curr_resulting_states states[k] = curr_states #print("resulting states append" + repr(resulting_state.shape))#state = resulting_state #resulting_state = np.array(resulting_state) # # use cost_fn to evaluate trajectories #print("k_accum states" + repr(states.shape)) #print("k_accum actions" + repr(sample_actions.shape)) #print("k_accum resulting" + repr(resulting_states.shape)) traj_cost = trajectory_cost_fn(self.cost_fn, curr_states, sample_actions[k], curr_resulting_states) costs[k] = (traj_cost) #print("num path" + repr(self.num_simulated_paths))#all_samples(resulting_state) # find best trejectory best_sim_num = np.argmin(costs) best_seq = sample_actions[best_sim_num] # return first action with best trajecory #print("best_seq " + repr(best_sim_num)) action = np.copy(best_seq[0]) #print("best_act " + repr(action.shape)) return action
def path_cost(cost_fn, path): costs = [] acc = 0 for i in path["ep_lens"]: acc_n = acc + i costs.append( trajectory_cost_fn(cost_fn, path['observations'][acc:acc_n], path['actions'][acc:acc_n], path['next_observations'][acc:acc_n])) acc = acc_n return costs
def get_action_mcs(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ exploration = self.sample_random_actions() # get init observations and copy num_simulated_paths times states = np.tile(state, [self.num_simulated_paths, 1]) states_paths_all = [] action_paths = [] states_paths_all.append(states) for i in range(self.horizon): if self.self_exp: actions, _ = self.policy_net.act(states, stochastic=True) else: actions, _ = self.policy_net.act(states, stochastic=False) # actions += np.random.rand(self.num_simulated_paths, self.env.action_space.shape[0]) * (2*self.explore) - self.explore actions = (1 - self.explore ) * actions + self.explore * exploration[i, :, :] states = self.dyn_model.predict(states, actions) # states = self.dyn_model.predict(states, action_paths[i, :, :]) states_paths_all.append(states) action_paths.append(actions) # evaluate trajectories states_paths_all = np.asarray(states_paths_all) action_paths = np.asarray(action_paths) # batch cost function states_paths = states_paths_all[:-1, :, :] states_nxt_paths = states_paths_all[1:, :, :] # print("action_paths: ", action_paths.shape) # print("states_paths: ", states_paths.shape) # print("states_nxt_paths: ", states_nxt_paths.shape) costs = trajectory_cost_fn(self.cost_fn, states_paths, action_paths, states_nxt_paths) min_cost_path = np.argmin(costs) opt_cost = costs[min_cost_path] opt_action_path = action_paths[:, min_cost_path, :] opt_action = copy.copy(opt_action_path[0]) # print("MPC imagine min cost: ", opt_cost) return opt_action
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ acs,ob,obs,costs,next_obs =[],[],[],[],[] [ob.append(state) for path in range(self.num_simulated_paths)] for i in range(self.horizon): ac=[] obs.append(ob) [ac.append(self.env.action_space.sample()) for path in range(self.num_simulated_paths)] acs.append(ac) ob=self.dyn_model.predict(np.array(ob),np.array(ac)) next_obs.append(ob) costs =trajectory_cost_fn(self.cost_fn,np.array(obs),np.array(acs),np.array(next_obs)) j = np.argmin(costs) return acs[0][j]
def get_action(self, state): """ Note: be careful to batch your simulations through the model for speed """ #sampled_acts = np.array([[self.env.action_space.sample() for j in range(self.num_simulated_paths)] for i in range(self.horizon)]) sampled_acts = np.array([[ self.env.action_space.sample() for j in range(self.num_simulated_paths) ] for i in range(self.horizon)]) #sampled_acts=np.random.randint(,size=[self.horizon,self.num_simulated_paths]) states = [np.array([state] * self.num_simulated_paths)] nstates = [] for i in range(self.horizon): nstates.append( self.dyn_model.predict(states[-1], sampled_acts[i, :])) if i < self.horizon: states.append(nstates[-1]) costs = trajectory_cost_fn(self.cost_fn, states, sampled_acts, nstates) return sampled_acts[0][np.argmin(costs)], min(costs)
def get_action(self, state): """ Note: be careful to batch your simulations through the model for speed """ """ – To evaluate the costs of imaginary rollouts, use trajectory_cost_fn, which requires a per-timestep cost_fn as an argument. Notice that the MPC controller gets a cost function as a keyword argument—this is what you should use! – When generating the imaginary rollouts starting from a state s, be efficient and batch the computation. At the first step, broadcast s to have shape (number of fictional rollouts, observation dim), and then use that as an input to the dynamics model prediction to produce the batch of next steps. – The cost functions are also designed for batch computations, so you can feed the whole batch of trajectories at once to trajectory_cost_fn. For details on how, read the code. """ """You shouldn't be using the environment for MPCcontroller's get_action. You should use the dynamics model to predict the next state, instead of using the env to get the actual next state.""" """ YOUR CODE HERE """ ac_dim = self.env.action_space.shape[0] low = self.env.action_space.low[ 0] # assuming all dimensions have same range high = self.env.action_space.high[ 0] # assuming all dimensions have same range states = np.repeat(np.array([[state]]), self.num_simulated_paths, axis=0) actions = np.random.uniform(low=low, high=high, size=(self.num_simulated_paths, self.horizon, ac_dim)) next_s = np.swapaxes( np.array(self.dyn_model.predict(states[:, -1, :], actions[:, 0, :])), 0, 1) states = np.concatenate((states, next_s), axis=1) for t in range(self.horizon - 1): next_s = np.swapaxes( np.array( self.dyn_model.predict(states[:, -1, :], actions[:, t + 1, :])), 0, 1) states = np.concatenate((states, next_s), axis=1) cost_dict = { i: trajectory_cost_fn(self.cost_fn, states[i, :-1, :], actions[i, :, :], states[i, 1:, :]) for i in range(self.num_simulated_paths) } j_star = min(cost_dict, key=cost_dict.get) return actions[j_star, 0, :]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ # need to be debuged obs, next_obs, acs, costs = [], [], [], [ ] #(horizon, num_simulated_paths, n_dim) ob = [obs.append(state) for _ in range(self.num_simulated_paths)] for _ in range(self.horizon): obs.append(ob) ac = [ acs.append(self.env.action_space.sample()) for _ in range(self.num_simulated_paths) ] acs.append(ac) ob = self.dyn_model.predict(ob, ac) next_obs.append(ob) costs = trajectory_cost_fn(self.cost_fn, obs, acs, next_obs) j = np.argmin(costs, ) # no batch # paths, costs = [], [] # for _ in range(self.num_simulated_paths): # ob = state # obs, next_obs, acs, rewards = [], [], [], [] # steps = 0 # while True: # obs.append(ob) # ac = self.env.action_space.sample() # acs.append(ac) # ob, rew, done, _ = self.dyn_model.predict(ob, ac) # next_obs.append(ob) # rewards.append(rew) # steps += 1 # if done or steps > self.horizon: # break # path = {"state": np.array(obs), # "next_state": np.array(obs), # "reward": np.array(rewards), # "action": np.array(acs)} # paths.append(path) # cost = trajectory_cost_fn(self.cost_fn, path['state'], path['action'], path['next_state']) # costs.append(cost) # j = np.argmin(costs) return acs[0][j]
def get_action(self, state): """ Batches simulations through the model for speed """ sampled_acts = np.array([[ self.env.action_space.sample() for j in range(self.num_simulated_paths) ] for i in range(self.horizon)]) states = [np.array([state] * self.num_simulated_paths)] nstates = [] a = np.array([state]) for i in range(self.horizon): nstates.append( self.dyn_model.predict(states[-1], sampled_acts[i, :])) if i < self.horizon: states.append(nstates[-1]) costs = trajectory_cost_fn(self.cost_fn, states, sampled_acts, nstates) return sampled_acts[0][np.argmin(costs)]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ s = np.ndarray(shape=(self.horizon, self.num_simulated_paths, self.env.observation_space.shape[0])) s[0, :, :] = state a = np.ndarray(shape=(self.horizon - 1, self.num_simulated_paths, self.env.action_space.shape[0])) upper_bound = self.env.action_space.high lower_bound = self.env.action_space.low for i in range(self.horizon - 1): a[i, :, :] = np.random.uniform( lower_bound, upper_bound, (self.num_simulated_paths, self.env.action_space.shape[0])) s[i + 1, :, :] = self.dyn_model.predict(s[i, :, :], a[i, :, :]) traj_cost = trajectory_cost_fn(self.cost_fn, s[0:-1], a, s[1:]) best_path = np.argmin(traj_cost) return a[0, best_path, :]
def get_action(self, state): self.actions_all = np.random.uniform(self.action_low, self.action_high, \ [self.horizon, self.num_simulated_paths, self.action_len]) self.states_all[ 0, :, :] = state # set initial state for each path to the arg state for i in range(0, self.horizon - 1): self.states_all[i + 1, :, :] = self.dyn_model.predict( self.states_all[i, :, :], self.actions_all[i, :, :]) self.states_next_all[0:-1, :, :] = self.states_all[1:, :, :] self.states_next_all[-1, :, :] = self.dyn_model.predict( self.states_all[-1, :, :], self.actions_all[-1, :, :]) ''' # stupid check HERE states_all_check = np.ones( [self.horizon, self.num_simulated_paths, self.state_len])*np.nan states_next_all_check = np.ones( [self.horizon, self.num_simulated_paths, self.state_len])*np.nan states_all_check[0,:,:] = state for t in range(self.num_simulated_paths): for i in range(0, self.horizon-1): s = states_all_check[i, t, :].reshape([1,states_all_check.shape[2]]) a = self.actions_all[i,t,:].reshape([1, self.actions_all.shape[2]]) states_all_check[i+1, t, :] = self.dyn_model.predict(s,a) states_next_all_check[i,t,:] = states_all_check[i+1, t, :] s = states_all_check[-1, t, :].reshape([1,states_all_check.shape[2]]) a = self.actions_all[-1,t,:].reshape([1, self.actions_all.shape[2]]) states_next_all_check[-1, t, :] = self.dyn_model.predict(s,a) pdb.set_trace() ''' costs_arr = trajectory_cost_fn(self.cost_fn, self.states_all, self.actions_all, \ self.states_next_all) opt_path_ind = np.argmin(costs_arr) act_opt = self.actions_all[0, opt_path_ind, :] pdb.set_trace() return act_opt
def get_action(self, state): """ batch simulations through the model for speed """ rand_cont = RandomController(self.env) obs, actions, next_obs = [],[],[] new_obs = np.repeat(state[np.newaxis,:],self.num_simulated_paths,0) for _ in range(self.horizon): new_actions = [rand_cont.get_action(state) for _ in range(self.num_simulated_paths)] obs.append(new_obs) new_obs = self.dyn_model.predict(obs,actions) next_obs.append(new_obs) actions.append(new_actions) obs,actions,next_obs = np.array(obs),np.array(actions),np.array(next_obs) costs = trajectory_cost_fn(self.cost_fn,obs,actions,next_obs) best_path_id = np.argmin(costs) return actions[0][best_path_id]
def get_action(self, state): ac_dim = self.env.action_space.shape[0] rand_acs = np.random.uniform(low=self.env.action_space.low, high=self.env.action_space.high, size=(self.num_simulated_paths, self.horizon, ac_dim)) paths = [] ob = np.ones((self.num_simulated_paths, len(state))) * state obs, acs, next_obs = [], [], [] for h in range(self.horizon): obs.append(ob) ac = rand_acs[:, h] ob = self.dyn_model.predict(ob, ac) next_obs.append(ob) acs.append(ac) obs, next_obs, acs = np.array(obs), np.array(next_obs), np.array(acs) costs = trajectory_cost_fn(self.cost_fn, obs, acs, next_obs) ind = np.argmin(costs) return acs[:, ind][0]
def get_action(self, state): """ YOUR CODE HERE """ """ Note: be careful to batch your simulations through the model for speed """ states_tab = [] actions_tab = [] next_states_tab = [] current_states = np.array( [state for i in range(self.num_simulated_paths)]) for time_step in range(self.horizon): random_actions = np.random.uniform(low=-1, high=1, size=(self.num_simulated_paths, self.act_dim)) states_tab.append(current_states) actions_tab.append(random_actions) next_states = self.dyn_model.predict(current_states, random_actions) next_states_tab.append(next_states) current_states = next_states actions_trajectories = np.array(actions_tab) states_trajectories = np.array(states_tab) next_states_trajectories = np.array(next_states_tab) # states_trajectories = np.swapaxes(states_trajectories, 0, 1) # actions_trajectories = np.swapaxes(actions_trajectories, 0, 1) # next_states_trajectories = np.swapaxes(next_states_trajectories, 0, 1) trajectory_cost = trajectory_cost_fn(self.cost_fn, states_trajectories, actions_trajectories, next_states_trajectories) best_trajectory_ind = np.argmin(trajectory_cost) best_action = actions_trajectories[0, best_trajectory_ind, :] best_state = states_trajectories[0, best_trajectory_ind, :] best_next_state = next_states_trajectories[0, best_trajectory_ind, :] return (self.cost_fn(best_state, best_action, best_next_state), best_action)
def get_action(self, state): """ Note: be careful to batch your simulations through the model for speed """ # TODO(jalex): Combine starting states tstep_params = { 'states': [], 'actions': [], 'next_states': [], } states = np.repeat(state.reshape((1, -1)), self.num_simulated_paths, axis=0) for t in range(self.horizon): actions = np.array( [self.env.action_space.sample() for _ in states]) states_next = self.dyn_model.predict(states, actions) tstep_params['states'].append(states) tstep_params['actions'].append(actions) tstep_params['next_states'].append(states_next) states = states_next costs = trajectory_cost_fn(self.cost_fn, **tstep_params) cost_argmin = np.argmin(costs) next_action_minimize_horizon_cost = tstep_params['actions'][0][ cost_argmin] return next_action_minimize_horizon_cost
def get_action(self, state): # Note: be careful to batch your simulations through the model for # speed all_states = [] all_actions = [] all_next_states = [] states = np.array([state] * self.num_simulated_paths) for _ in range(self.horizon): actions = np.array([ self.env.action_space.sample() for _ in range(self.num_simulated_paths) ]) next_states = self.dyn_model.predict(states, actions) all_states.append(states) all_actions.append(actions) all_next_states.append(next_states) states = next_states costs = trajectory_cost_fn(self.cost_fn, all_states, all_actions, all_next_states) return all_actions[0][np.argmin(costs)]
def sample(env, controller, num_paths=10, horizon=1000, render=False, verbose=False): """ Write a sampler function which takes in an environment, a controller (either random or the MPC controller), and returns rollouts by running on the env. Each path can have elements for observations, next_observations, rewards, returns, actions, etc. """ paths = [] """ YOUR CODE HERE """ # iterate over the paths for n_path in range(num_paths): ob = env.reset() obs, n_obs, acs, rewards = [], [], [],[] # number of horizons to collect action and states for for n_horz in range(horizon): obs.append(ob) ac = controller.get_action(ob) acs.append(ac) ob, rew, done, _ = env.step(ac) n_obs.append(ob) rewards.append(rew) # collect the data and cost of the path cost = trajectory_cost_fn(cheetah_cost_fn, np.array(obs), np.array(acs), np.array(n_obs)) # add data path = {"observations" : np.array(obs), "next_observations" : np.array(n_obs), "rewards" : np.array(rewards), "actions" : np.array(acs), "costs" : cost} paths.append(path) return paths