Example #1
    def get_action(self, state):
        obs, obs_list, obs_next_list, act_list = [], [], [], []

        obs = [state] * self.num_simulated_paths  # one copy of the current state per simulated path

        for _ in range(self.horizon):
            obs_list.append(obs)

            # get random actions
            actions = [
                self.env.action_space.sample()
                for _ in range(self.num_simulated_paths)
            ]

            act_list.append(actions)
            obs = self.dyn_model.predict(np.array(obs), np.array(actions))
            obs_next_list.append(obs)

        trajectory_cost_list = trajectory_cost_fn(self.cost_fn,
                                                  np.array(obs_list),
                                                  np.array(act_list),
                                                  np.array(obs_next_list))

        j = np.argmin(trajectory_cost_list)

        return act_list[0][j]
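All of these `get_action` variants call a batched `trajectory_cost_fn` helper and a `dyn_model.predict` that maps a batch of states and actions to a batch of next states; neither definition is shown on this page. A minimal sketch of the cost helper the snippets appear to assume, with trajectories laid out as (horizon, num_simulated_paths, dim):

import numpy as np

def trajectory_cost_fn(cost_fn, states, actions, next_states):
    # states, actions, next_states: shape (horizon, num_simulated_paths, dim);
    # cost_fn is assumed to accept batched rows and return one cost per path
    trajectory_cost = 0
    for i in range(len(actions)):
        trajectory_cost += cost_fn(states[i], actions[i], next_states[i])
    return trajectory_cost  # one accumulated cost per simulated path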
Example #2
	def get_action(self, state):
		# Broadcast state for batch calculations
		batch_state = np.tile(state,(self.num_simulated_paths,1))
		best_traj = None
		state_traj = batch_state[np.newaxis,:]

		for i in range(self.horizon):
			# Sample actions
			batch_action = self.action_low+np.random.uniform(size=(self.num_simulated_paths,self.ac_dim))*(self.action_high-self.action_low)
			# Simulate dynamics from NN model
			batch_state = self.dyn_model.predict(batch_state,batch_action)
			# Add next state batch to trajectory
			state_traj = np.vstack((state_traj,batch_state[np.newaxis,:]))
			# Add actions to sequence
			if i == 0:
				action_seq = batch_action[np.newaxis,:]
			else:
				action_seq = np.vstack((action_seq,batch_action[np.newaxis,:]))

		s_t_traj = state_traj[0:self.horizon,:,:]
		s_tp1_traj = state_traj[1:self.horizon+1,:,:]
		traj_cost = trajectory_cost_fn(self.cost_fn,s_t_traj,action_seq,s_tp1_traj)
		best_traj = np.argmin(traj_cost)

		# Get best action
		best_action = action_seq[0,best_traj,:].reshape((self.ac_dim))
		return best_action
		""" Note: be careful to batch your simulations through the model for speed """
Example #3
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        obs = state * np.ones(
            (self.num_simulated_paths, state.shape[0]))  # (paths, obs_dim)
        observations = []  # [(paths, obs_dim), ...]
        actions = []  # [(paths, action_dim), ...]
        next_observations = []  # [(paths, obs_dim), ...]

        for i in range(self.horizon):
            # sample from action candidates (instead of calling env.action_space.sample() every iteration)
            random_idx = np.random.choice(self.action_candidates.shape[0],
                                          obs.shape[0],
                                          replace=False)
            action = self.action_candidates[random_idx]
            #action = np.array([self.env.action_space.sample() for i in range(self.num_simulated_paths)])
            observations += [obs]
            actions += [action]
            obs = self.dyn_model.predict(obs, action)
            next_observations += [obs]

        costs = trajectory_cost_fn(self.cost_fn, observations, actions,
                                   next_observations)  # (paths, )

        return actions[0][np.argmin(costs)]
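Example #3 draws indices into a precomputed `self.action_candidates` pool instead of calling `env.action_space.sample()` inside the loop. A hedged sketch of how such a pool might be built once, e.g. in `__init__` (the pool size is an arbitrary illustrative value; since the sampling above uses `replace=False`, the pool must hold at least `num_simulated_paths` actions):

        # hypothetical precomputation of an action pool reused across planning calls
        num_candidates = 10000
        self.action_candidates = np.random.uniform(
            low=self.env.action_space.low,
            high=self.env.action_space.high,
            size=(num_candidates, self.env.action_space.shape[0]))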
Example #4
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        action_paths = self.sample_random_actions()

        # get init observations and copy num_simulated_paths times
        states = np.tile(state, [self.num_simulated_paths, 1])

        states_paths_all = []
        states_paths_all.append(states)

        for i in range(self.horizon):
            states = self.dyn_model.predict(states, action_paths[i, :, :])
            states_paths_all.append(states)

        # evaluate trajectories
        states_paths_all = np.asarray(states_paths_all)

        # batch cost function
        states_paths = states_paths_all[:-1, :, :]
        states_nxt_paths = states_paths_all[1:, :, :]

        costs = trajectory_cost_fn(self.cost_fn, states_paths, action_paths,
                                   states_nxt_paths)

        min_cost_path = np.argmin(costs)
        opt_cost = costs[min_cost_path]
        opt_action_path = action_paths[:, min_cost_path, :]
        opt_action = copy.copy(opt_action_path[0])

        # print("MPC imagine min cost: ", opt_cost)
        return opt_action
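Example #4 (and Example #17 further down) relies on a `self.sample_random_actions()` helper that is not included. Judging from how `action_paths[i, :, :]` is indexed, it presumably returns an array of shape (horizon, num_simulated_paths, action_dim); a minimal sketch under that assumption:

    def sample_random_actions(self):
        # hypothetical helper: one uniformly random action per path per timestep
        return np.random.uniform(
            low=self.env.action_space.low,
            high=self.env.action_space.high,
            size=(self.horizon, self.num_simulated_paths,
                  self.env.action_space.shape[0]))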
Example #5
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        obs_dim = self.env.observation_space.shape[0]
        a_dim = self.env.action_space.shape[0]

        actions = np.zeros((self.horizon, self.num_simulated_paths, a_dim))
        states = np.zeros((self.horizon, self.num_simulated_paths, obs_dim))
        prime_states = np.zeros(
            (self.horizon, self.num_simulated_paths, obs_dim))

        states[0, :, :] = np.full((self.num_simulated_paths, obs_dim), state)

        for step in range(self.horizon):
            states_t = states[step, :, :]
            a_t = np.array([
                self.env.action_space.sample()
                for _ in range(self.num_simulated_paths)
            ])
            ps_t = self.dyn_model.predict(states_t, a_t)

            actions[step, :, :] = a_t
            prime_states[step, :, :] = ps_t
            if step < self.horizon - 1:
                states[step + 1, :, :] = ps_t

        costs = trajectory_cost_fn(self.cost_fn, states, actions, prime_states)

        best_traj_index = np.argmin(costs, axis=0)

        return actions[0][best_traj_index]
Example #6
    def get_action(self, state):
        """ Note: be careful to batch your simulations through the model for speed """
        states = np.repeat(np.expand_dims(state, axis=0),
                           self.num_simulated_paths,
                           axis=0)

        def get_rand_actions():
            actions = []
            for i in range(self.num_simulated_paths):
                actions.append(self.env.action_space.sample())
            actions = np.array(actions)
            return actions

        actions = get_rand_actions()
        states_traj = []
        next_states_traj = []
        actions_traj = []

        for i in range(self.horizon):
            next_states = self.dyn_model.predict(states, actions)
            actions_traj.append(actions)
            states_traj.append(states)
            next_states_traj.append(next_states)

            states = next_states
            actions = get_rand_actions()

        # keep trajectories in (horizon, num_paths, dim) order so that
        # trajectory_cost_fn returns one summed cost per simulated path
        states_traj = np.array(states_traj)
        actions_traj = np.array(actions_traj)
        next_states_traj = np.array(next_states_traj)
        cost_val = trajectory_cost_fn(self.cost_fn, states_traj, actions_traj,
                                      next_states_traj)
        # first action of the lowest-cost imagined trajectory
        return actions_traj[0, np.argmin(cost_val), :]
Example #7
    def get_action(self, state):
        """ Note: be careful to batch your simulations through the model for speed """

        state_batch, states, next_states, actions = [], [], [], []

        #state batches have dimension (K, dim(state))
        for _ in range(self.num_simulated_paths):
            state_batch.append(state)

        for _ in range(self.horizon):
            action = []
            for _ in range(self.num_simulated_paths):
                action.append(self.env.action_space.sample())
            actions.append(action)
            states.append(state_batch)
            #use batch for speed
            state_batch = self.dyn_model.predict(np.array(state_batch),
                                                 np.array(action))

            next_states.append(state_batch)

        costs = trajectory_cost_fn(self.cost_fn, np.array(states),
                                   np.array(actions), np.array(next_states))
        j_star = np.argmin(np.array(costs))
        return actions[0][j_star]
Example #8
 def get_action(self, state):
     """ YOUR CODE HERE """
     """ Note: be careful to batch your simulations through the model for speed """
     action_dim = self.env.action_space.shape[0]
     # return np.zeros(self.env.action_space.shape)
     state_dim = self.env.observation_space.shape[0]
     path_actions = np.zeros(
         [self.horizon, self.num_simulated_paths, action_dim])
     path_states = np.zeros(
         [self.horizon, self.num_simulated_paths, state_dim])
     path_next_states = np.zeros(
         [self.horizon, self.num_simulated_paths, state_dim])
     states = np.ones([self.num_simulated_paths, state_dim
                       ]) * state.reshape([-1, state_dim])
     for i in range(self.horizon):
         path_states[i] = states  # record the current batch of states, not the initial state
         path_actions[i] = np.asarray([
             self.env.action_space.sample()
             for _ in range(self.num_simulated_paths)
         ])
         states = self.dyn_model.predict(states, path_actions[i])
         path_next_states[i] = states
     path_costs = trajectory_cost_fn(self.cost_fn, path_states,
                                     path_actions, path_next_states)
     best = np.argmin(path_costs)
     return path_actions[0, best]
Example #9
    def get_action(self, state):
        # horizon * num_paths* state_dim or action_dim

        trajs = np.array([
            self.env.action_space.sample()
            for p in range(self.horizon * self.num_simulated_paths)
        ]).reshape((self.horizon, self.num_simulated_paths, -1))
        #print "Trajs dimension", trajs.shape
        #TODO: action_dim #n_paths*horizon*action_dim
        accum_states = np.zeros(
            (self.horizon, self.num_simulated_paths, state.shape[0]))
        accum_next_states = np.zeros(
            (self.horizon, self.num_simulated_paths, state.shape[0]))
        states = np.ones(
            (self.num_simulated_paths, 1)) * state  # n_paths*state_dim
        #print "state", state
        #print "accum_states", accum_states
        #print "accum_states shape", accum_states.shape
        for time_idx in range(self.horizon):
            actions = trajs[time_idx, :, :]  # n_paths * action_dim
            #print "action dimension", actions.shape
            next_states = self.dyn_model.predict(states, actions)
            #print "next states dimension", next_states.shape
            accum_states[time_idx, :, :] = states
            accum_next_states[time_idx, :, :] = next_states
            states = next_states.copy(
            )  #set the states to be next_states for next time_idx
        cost = trajectory_cost_fn(self.cost_fn, accum_states, trajs,
                                  accum_next_states)
        #print "cost shape", cost.shape
        #print "min cost", np.min(cost)
        #print "min cost arg way", cost[np.argmin(cost)]
        action = trajs[0, np.argmin(cost), :]  # first action for min rollout
        return action.flatten()
Example #10
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        current_states = np.zeros((self.horizon, self.num_simulated_paths,
                                   self.env.observation_space.shape[0]))
        actions = np.zeros((self.horizon, self.num_simulated_paths,
                            self.env.action_space.shape[0]))
        next_states = np.zeros((self.horizon, self.num_simulated_paths,
                                self.env.observation_space.shape[0]))
        current_states[0, :, :] = state
        for h in range(self.horizon):
            # change for np.random.uniform
            actions[h, :, :] = np.random.uniform(
                self.env.action_space.low, self.env.action_space.high,
                (self.num_simulated_paths, self.env.action_space.shape[0]))

            next_states[h, :, :] = self.dyn_model.predict(
                current_states[h, :, :], actions[h, :, :])
            # change next current state with next states
            if h < self.horizon - 1:
                current_states[h + 1, :, :] = next_states[h, :, :]

        tc = trajectory_cost_fn(self.cost_fn, current_states, actions,
                                next_states)
        min_cost = np.argmin(tc)

        return actions[0][min_cost]
Example #11
	def get_action(self, state):
		""" YOUR CODE HERE """
		""" Note: be careful to batch your simulations through the model for speed """
		states = []
		actions = []
		nxt_states = []
		state_batch = np.tile(state, [self.num_simulated_paths, 1]) # copy state for simulated numbers path and make them a fake batch
		# do the imagination rolling out
		for j in range(self.horizon):
			action_batch = []
			for i in range(self.num_simulated_paths): # random policy
				action_batch.append(self.env.action_space.sample()) # one random action per simulated path for this step
			action_batch = np.asarray(action_batch)
			nxt_batch = self.dyn_model.predict(state_batch,action_batch) # rollout next batches
			
			states.append(state_batch)
			actions.append(action_batch)
			nxt_states.append(nxt_batch)
			state_batch = np.copy(nxt_batch)
		
		traj_s=np.asarray(states).transpose(1,0,2)
		traj_nxt_s = np.asarray(nxt_states).transpose(1,0,2)
		traj_action = np.asarray(actions).transpose(1,0,2)
		costs = []
		for i in range(traj_s.shape[0]):
			# use the cost function passed to the controller rather than hardcoding cheetah_cost_fn
			trajectory_cost = trajectory_cost_fn(self.cost_fn, traj_s[i], traj_action[i], traj_nxt_s[i])
			costs.append(trajectory_cost)

		a_idx = np.argmin(np.array(costs))

		return traj_action[a_idx][0]
Example #12
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        N = self.num_simulated_paths
        h = self.horizon
        action_samples = np.random.uniform(self.env.action_space.low, self.env.action_space.high, \
                                            [h, N, *self.env.action_space.shape])

        state_samples = [np.tile(state, (N, 1))]

        for step in range(h):
            current_state = state_samples[-1]
            current_action = action_samples[step]
            state_samples.append(
                self.dyn_model.predict(current_state, current_action))

        next_states = np.swapaxes(np.array(state_samples[1:]), 1, 0)
        state_samples = np.swapaxes(np.array(state_samples[:-1]), 1, 0)
        action_samples = np.swapaxes(action_samples, 1, 0)

        costs = np.array([
            trajectory_cost_fn(self.cost_fn, s, a, n)
            for s, a, n in zip(state_samples, action_samples, next_states)
        ])
        best_index = costs.argmin()
        best_action = action_samples[best_index][0]
        best_cost = costs[best_index]

        return best_action, best_cost
Example #13
def path_cost(cost_fn, path):

    costs = []
    cum_len = 0
    for i in path["ep_lengths"]:
        costs.append(trajectory_cost_fn(cost_fn, path['observations'][cum_len: cum_len+i], path['actions'][cum_len: cum_len+i], path['next_observations'][cum_len: cum_len+i]))
        cum_len = cum_len + i
    return costs
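`path_cost` expects a path dictionary whose `observations`, `actions`, and `next_observations` arrays hold several episodes concatenated back to back, with `ep_lengths` giving each episode's length. A small illustrative call (dimensions and the stand-in cost function are made up):

import numpy as np

obs_dim, ac_dim = 20, 6  # made-up dimensions
path = {
    "observations": np.zeros((5, obs_dim)),
    "actions": np.zeros((5, ac_dim)),
    "next_observations": np.zeros((5, obs_dim)),
    "ep_lengths": [3, 2],  # two episodes of lengths 3 and 2
}
dummy_cost_fn = lambda s, a, ns: 0.0  # stand-in for a per-timestep cost such as cheetah_cost_fn
per_episode_costs = path_cost(dummy_cost_fn, path)  # one summed cost per episode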
Example #14
    def get_action(self, state):
        """ YOUR CODE HERE """
        # TODO: random select all actions before predict.

        # states = []
        # actions = []
        # next_states = []
        # states.append(np.repeat([state], self.num_simulated_paths, axis=0))
        # for i in range(self.horizon):
        # 	path_action = []
        # 	for k in range(self.num_simulated_paths):
        # 		path_action.append(self.env.action_space.sample())
        # 	actions.append(path_action)

        # for i in range(self.horizon):
        # 	batch_states = states[i]
        # 	batch_actions = actions[i]
        # 	batch_next_states = self.dyn_model.predict(batch_states, batch_actions)
        # 	states.append(batch_states)
        # 	next_states.append(batch_states)
        # traj_score = trajectory_cost_fn(self.cost_fn, np.array(states), np.array(actions), np.array(next_states))
        # ind_ac = np.argmin(traj_score)
        # return actions[0][ind_ac]

        paths = {}
        for k in range(self.num_simulated_paths):
            paths[k] = {}
            paths[k]["states"] = [state]
            paths[k]["actions"] = []
            paths[k]["next_states"] = []

        for i in range(self.horizon):
            states = np.concatenate([[paths[k]["states"][i]]
                                     for k in range(len(paths))])
            actions = []
            for k in range(self.num_simulated_paths):
                act = self.env.action_space.sample()
                paths[k]["actions"].append(act)
                actions.append(act)
            actions = np.array(actions)

            next_states = self.dyn_model.predict(states, actions)
            for k in range(self.num_simulated_paths):
                paths[k]["states"].append(next_states[k])
                paths[k]["next_states"].append(next_states[k])

        traj_score = []
        for k in range(self.num_simulated_paths):
            traj_score.append(
                trajectory_cost_fn(self.cost_fn, paths[k]["states"],
                                   paths[k]["actions"],
                                   paths[k]["next_states"]))
        traj_score = np.array(traj_score)
        ind_ac = np.argmin(traj_score)
        return paths[ind_ac]["actions"][0]
        """ Note: be careful to batch your simulations through the model for speed """
Example #15
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """

        #print("num path" + repr(self.num_simulated_paths))
        costs = np.zeros((self.num_simulated_paths))

        sample_actions = np.random.uniform(
            self.env.action_space.low, self.env.action_space.high,
            (self.num_simulated_paths, self.horizon,
             self.env.action_space.shape[0]))

        #print("sample_actions" + repr(sample_actions.shape))
        #print("state" + repr(state.shape))

        states = np.zeros(
            (self.num_simulated_paths, self.horizon, state.shape[0]))
        resulting_states = np.zeros(
            (self.num_simulated_paths, self.horizon, state.shape[0]))

        # use dynamics model to generate simulated rollouts (s_t+1 = f(s_t, a_t))
        for k in range(self.num_simulated_paths):  # for each simulated path
            actions = np.copy(sample_actions[k])
            #print(k)
            #for h in range(self.horizon):
            #print("k sample actions " + repr(actions[h].shape))
            #action = actions[h]
            # get next states
            curr_states, curr_resulting_states = self.dyn_model.predict(
                state, actions)
            #print("resulting_states " + repr(resulting_state.shape))

            resulting_states[k] = curr_resulting_states
            states[k] = curr_states
            #print("resulting states append" + repr(resulting_state.shape))#state = resulting_state
            #resulting_state = np.array(resulting_state) #

            # use cost_fn to evaluate trajectories
            #print("k_accum states" + repr(states.shape))
            #print("k_accum actions" + repr(sample_actions.shape))
            #print("k_accum resulting" + repr(resulting_states.shape))
            traj_cost = trajectory_cost_fn(self.cost_fn, curr_states,
                                           sample_actions[k],
                                           curr_resulting_states)
            costs[k] = (traj_cost)
            #print("num path" + repr(self.num_simulated_paths))#all_samples(resulting_state)

        # find best trajectory
        best_sim_num = np.argmin(costs)
        best_seq = sample_actions[best_sim_num]
        # return the first action of the best trajectory
        #print("best_seq " + repr(best_sim_num))
        action = np.copy(best_seq[0])
        #print("best_act " + repr(action.shape))
        return action
Example #16
def path_cost(cost_fn, path):
    costs = []
    acc = 0
    for i in path["ep_lens"]:
        acc_n = acc + i
        costs.append(
            trajectory_cost_fn(cost_fn, path['observations'][acc:acc_n],
                               path['actions'][acc:acc_n],
                               path['next_observations'][acc:acc_n]))
        acc = acc_n
    return costs
Example #17
    def get_action_mcs(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        exploration = self.sample_random_actions()

        # get init observations and copy num_simulated_paths times
        states = np.tile(state, [self.num_simulated_paths, 1])

        states_paths_all = []
        action_paths = []
        states_paths_all.append(states)

        for i in range(self.horizon):
            if self.self_exp:
                actions, _ = self.policy_net.act(states, stochastic=True)
            else:
                actions, _ = self.policy_net.act(states, stochastic=False)
                # actions += np.random.rand(self.num_simulated_paths, self.env.action_space.shape[0]) * (2*self.explore) - self.explore

                actions = (1 - self.explore
                           ) * actions + self.explore * exploration[i, :, :]

            states = self.dyn_model.predict(states, actions)

            # states = self.dyn_model.predict(states, action_paths[i, :, :])
            states_paths_all.append(states)
            action_paths.append(actions)

        # evaluate trajectories
        states_paths_all = np.asarray(states_paths_all)
        action_paths = np.asarray(action_paths)

        # batch cost function
        states_paths = states_paths_all[:-1, :, :]
        states_nxt_paths = states_paths_all[1:, :, :]

        # print("action_paths: ", action_paths.shape)
        # print("states_paths: ", states_paths.shape)
        # print("states_nxt_paths: ", states_nxt_paths.shape)

        costs = trajectory_cost_fn(self.cost_fn, states_paths, action_paths,
                                   states_nxt_paths)

        min_cost_path = np.argmin(costs)
        opt_cost = costs[min_cost_path]
        opt_action_path = action_paths[:, min_cost_path, :]
        opt_action = copy.copy(opt_action_path[0])

        # print("MPC imagine min cost: ", opt_cost)
        return opt_action
Example #18
	def get_action(self, state):
		""" YOUR CODE HERE """
		""" Note: be careful to batch your simulations through the model for speed """
		acs, obs, next_obs = [], [], []
		ob = [state for _ in range(self.num_simulated_paths)]
		for i in range(self.horizon):
			obs.append(ob)
			ac = [self.env.action_space.sample() for _ in range(self.num_simulated_paths)]
			acs.append(ac)
			ob = self.dyn_model.predict(np.array(ob), np.array(ac))
			next_obs.append(ob)
		costs =trajectory_cost_fn(self.cost_fn,np.array(obs),np.array(acs),np.array(next_obs))
		j = np.argmin(costs)

		return acs[0][j]
Example #19
 def get_action(self, state):
     """ Note: be careful to batch your simulations through the model for speed """
     #sampled_acts = np.array([[self.env.action_space.sample() for j in range(self.num_simulated_paths)] for i in range(self.horizon)])
     sampled_acts = np.array([[
         self.env.action_space.sample()
         for j in range(self.num_simulated_paths)
     ] for i in range(self.horizon)])
     #sampled_acts=np.random.randint(,size=[self.horizon,self.num_simulated_paths])
     states = [np.array([state] * self.num_simulated_paths)]
     nstates = []
     for i in range(self.horizon):
         nstates.append(
             self.dyn_model.predict(states[-1], sampled_acts[i, :]))
         if i < self.horizon: states.append(nstates[-1])
     costs = trajectory_cost_fn(self.cost_fn, states, sampled_acts, nstates)
     return sampled_acts[0][np.argmin(costs)], min(costs)
Example #20
    def get_action(self, state):
        """ Note: be careful to batch your simulations through the model for speed """
        """ – To evaluate the costs of imaginary rollouts, use trajectory_cost_fn, which
		requires a per-timestep cost_fn as an argument. Notice that the MPC controller
		gets a cost function as a keyword argument—this is what you should use!
		– When generating the imaginary rollouts starting from a state s, be efficient and
		batch the computation. At the first step, broadcast s to have shape (number of
		fictional rollouts, observation dim), and then use that as an input to the dynamics
		model prediction to produce the batch of next steps.
		– The cost functions are also designed for batch computations, so you can feed the
		whole batch of trajectories at once to trajectory_cost_fn. For details on
		how, read the code. """
        """You shouldn't be using the environment for MPCcontroller's get_action.  
		You should use the dynamics model to predict the next state, instead of using the env 
		to get the actual next state."""
        """ YOUR CODE HERE """
        ac_dim = self.env.action_space.shape[0]
        low = self.env.action_space.low[
            0]  # assuming all dimensions have same range
        high = self.env.action_space.high[
            0]  # assuming all dimensions have same range

        states = np.repeat(np.array([[state]]),
                           self.num_simulated_paths,
                           axis=0)
        actions = np.random.uniform(low=low,
                                    high=high,
                                    size=(self.num_simulated_paths,
                                          self.horizon, ac_dim))
        # assumes dyn_model.predict returns output that np.array(...) turns into
        # shape (1, num_paths, obs_dim), so swapaxes yields (num_paths, 1, obs_dim)
        next_s = np.swapaxes(
            np.array(self.dyn_model.predict(states[:, -1, :],
                                            actions[:, 0, :])), 0, 1)
        states = np.concatenate((states, next_s), axis=1)
        for t in range(self.horizon - 1):
            next_s = np.swapaxes(
                np.array(
                    self.dyn_model.predict(states[:, -1, :],
                                           actions[:, t + 1, :])), 0, 1)
            states = np.concatenate((states, next_s), axis=1)
        cost_dict = {
            i: trajectory_cost_fn(self.cost_fn, states[i, :-1, :],
                                  actions[i, :, :], states[i, 1:, :])
            for i in range(self.num_simulated_paths)
        }
        j_star = min(cost_dict, key=cost_dict.get)
        return actions[j_star, 0, :]
Example #21
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        obs, next_obs, acs = [], [], []  # each: (horizon, num_simulated_paths, dim)
        ob = [state for _ in range(self.num_simulated_paths)]
        for _ in range(self.horizon):
            obs.append(ob)
            ac = [
                self.env.action_space.sample()
                for _ in range(self.num_simulated_paths)
            ]
            acs.append(ac)
            ob = self.dyn_model.predict(np.array(ob), np.array(ac))
            next_obs.append(ob)
        costs = trajectory_cost_fn(self.cost_fn, np.array(obs), np.array(acs),
                                   np.array(next_obs))
        j = np.argmin(costs)

        # no batch
        # paths, costs = [], []
        # for _ in range(self.num_simulated_paths):
        #     ob = state
        #     obs, next_obs, acs, rewards = [], [], [], []
        #     steps = 0
        #     while True:
        #         obs.append(ob)
        #         ac = self.env.action_space.sample()
        #         acs.append(ac)
        #         ob, rew, done, _ = self.dyn_model.predict(ob, ac)
        #         next_obs.append(ob)
        #         rewards.append(rew)
        #         steps += 1
        #         if done or steps > self.horizon:
        #             break
        #     path = {"state": np.array(obs),
        #             "next_state": np.array(obs),
        #             "reward": np.array(rewards),
        #             "action": np.array(acs)}
        #     paths.append(path)
        #     cost = trajectory_cost_fn(self.cost_fn, path['state'], path['action'], path['next_state'])
        #     costs.append(cost)
        # j = np.argmin(costs)

        return acs[0][j]
Example #22
    def get_action(self, state):
        """ Batches simulations through the model for speed """
        sampled_acts = np.array([[
            self.env.action_space.sample()
            for j in range(self.num_simulated_paths)
        ] for i in range(self.horizon)])
        states = [np.array([state] * self.num_simulated_paths)]
        nstates = []

        for i in range(self.horizon):
            nstates.append(
                self.dyn_model.predict(states[-1], sampled_acts[i, :]))

            if i < self.horizon: states.append(nstates[-1])
        costs = trajectory_cost_fn(self.cost_fn, states, sampled_acts, nstates)

        return sampled_acts[0][np.argmin(costs)]
Example #23
 def get_action(self, state):
     """ YOUR CODE HERE """
     """ Note: be careful to batch your simulations through the model for speed """
     s = np.ndarray(shape=(self.horizon, self.num_simulated_paths,
                           self.env.observation_space.shape[0]))
     s[0, :, :] = state
     a = np.ndarray(shape=(self.horizon - 1, self.num_simulated_paths,
                           self.env.action_space.shape[0]))
     upper_bound = self.env.action_space.high
     lower_bound = self.env.action_space.low
     for i in range(self.horizon - 1):
         a[i, :, :] = np.random.uniform(
             lower_bound, upper_bound,
             (self.num_simulated_paths, self.env.action_space.shape[0]))
         s[i + 1, :, :] = self.dyn_model.predict(s[i, :, :], a[i, :, :])
     traj_cost = trajectory_cost_fn(self.cost_fn, s[0:-1], a, s[1:])
     best_path = np.argmin(traj_cost)
     return a[0, best_path, :]
Example #24
    def get_action(self, state):

        self.actions_all = np.random.uniform(self.action_low, self.action_high, \
         [self.horizon, self.num_simulated_paths, self.action_len])

        self.states_all[
            0, :, :] = state  # set initial state for each path to the arg state

        for i in range(0, self.horizon - 1):
            self.states_all[i + 1, :, :] = self.dyn_model.predict(
                self.states_all[i, :, :], self.actions_all[i, :, :])

        self.states_next_all[0:-1, :, :] = self.states_all[1:, :, :]
        self.states_next_all[-1, :, :] = self.dyn_model.predict(
            self.states_all[-1, :, :], self.actions_all[-1, :, :])
        '''
		# stupid check HERE
		states_all_check = np.ones( [self.horizon, self.num_simulated_paths, self.state_len])*np.nan
		states_next_all_check = np.ones( [self.horizon, self.num_simulated_paths, self.state_len])*np.nan

		states_all_check[0,:,:] = state

		for t in range(self.num_simulated_paths):
			for i in range(0, self.horizon-1):
				s = states_all_check[i, t, :].reshape([1,states_all_check.shape[2]])
				a = self.actions_all[i,t,:].reshape([1, self.actions_all.shape[2]])
				states_all_check[i+1, t, :] = self.dyn_model.predict(s,a)
				states_next_all_check[i,t,:] = states_all_check[i+1, t, :]

			s = states_all_check[-1, t, :].reshape([1,states_all_check.shape[2]])
			a = self.actions_all[-1,t,:].reshape([1, self.actions_all.shape[2]])
			states_next_all_check[-1, t, :] = self.dyn_model.predict(s,a)

		pdb.set_trace()
		'''

        costs_arr = trajectory_cost_fn(self.cost_fn, self.states_all, self.actions_all, \
               self.states_next_all)

        opt_path_ind = np.argmin(costs_arr)
        act_opt = self.actions_all[0, opt_path_ind, :]
        return act_opt
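Example #24 writes into preallocated `self.states_all` and `self.states_next_all` buffers and reads `self.action_low`, `self.action_high`, and `self.action_len`, none of which are shown. A hedged sketch of the constructor-side preallocation it seems to assume (attribute names copied from the snippet above; `self.horizon` and `self.num_simulated_paths` are assumed to be set already):

        # hypothetical constructor fragment
        self.state_len = env.observation_space.shape[0]
        self.action_len = env.action_space.shape[0]
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        self.states_all = np.zeros(
            [self.horizon, self.num_simulated_paths, self.state_len])
        self.states_next_all = np.zeros(
            [self.horizon, self.num_simulated_paths, self.state_len])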
Example #25
	def get_action(self, state):
		""" batch simulations through the model for speed """
		rand_cont = RandomController(self.env)

		obs, actions, next_obs = [],[],[]
		new_obs = np.repeat(state[np.newaxis,:],self.num_simulated_paths,0)

		for _ in range(self.horizon):
			new_actions = [rand_cont.get_action(state) for _ in range(self.num_simulated_paths)]
			obs.append(new_obs)
			new_obs = self.dyn_model.predict(new_obs, np.array(new_actions)) # step from the current batch of states
			next_obs.append(new_obs)
			actions.append(new_actions)

		obs,actions,next_obs = np.array(obs),np.array(actions),np.array(next_obs)
		costs = trajectory_cost_fn(self.cost_fn,obs,actions,next_obs)

		best_path_id = np.argmin(costs)
		return actions[0][best_path_id]
Example #26
    def get_action(self, state):
        ac_dim = self.env.action_space.shape[0]
        rand_acs = np.random.uniform(low=self.env.action_space.low,
                                     high=self.env.action_space.high,
                                     size=(self.num_simulated_paths,
                                           self.horizon, ac_dim))

        ob = np.ones((self.num_simulated_paths, len(state))) * state
        obs, acs, next_obs = [], [], []
        for h in range(self.horizon):
            obs.append(ob)
            ac = rand_acs[:, h]
            ob = self.dyn_model.predict(ob, ac)
            next_obs.append(ob)
            acs.append(ac)
        obs, next_obs, acs = np.array(obs), np.array(next_obs), np.array(acs)
        costs = trajectory_cost_fn(self.cost_fn, obs, acs, next_obs)
        ind = np.argmin(costs)
        return acs[:, ind][0]
Example #27
    def get_action(self, state):
        """ YOUR CODE HERE """
        """ Note: be careful to batch your simulations through the model for speed """
        states_tab = []
        actions_tab = []
        next_states_tab = []
        current_states = np.array(
            [state for i in range(self.num_simulated_paths)])
        for time_step in range(self.horizon):
            # assumes actions are normalized to [-1, 1]; otherwise sample
            # between self.env.action_space.low and .high
            random_actions = np.random.uniform(low=-1,
                                               high=1,
                                               size=(self.num_simulated_paths,
                                                     self.act_dim))
            states_tab.append(current_states)
            actions_tab.append(random_actions)
            next_states = self.dyn_model.predict(current_states,
                                                 random_actions)
            next_states_tab.append(next_states)
            current_states = next_states

        actions_trajectories = np.array(actions_tab)
        states_trajectories = np.array(states_tab)
        next_states_trajectories = np.array(next_states_tab)

        # states_trajectories = np.swapaxes(states_trajectories, 0, 1)
        # actions_trajectories = np.swapaxes(actions_trajectories, 0, 1)
        # next_states_trajectories = np.swapaxes(next_states_trajectories, 0, 1)

        trajectory_cost = trajectory_cost_fn(self.cost_fn, states_trajectories,
                                             actions_trajectories,
                                             next_states_trajectories)

        best_trajectory_ind = np.argmin(trajectory_cost)
        best_action = actions_trajectories[0, best_trajectory_ind, :]
        best_state = states_trajectories[0, best_trajectory_ind, :]
        best_next_state = next_states_trajectories[0, best_trajectory_ind, :]

        return (self.cost_fn(best_state, best_action,
                             best_next_state), best_action)
Example #28
    def get_action(self, state):
        """ Note: be careful to batch your simulations through the model for speed """

        # TODO(jalex): Combine starting states

        tstep_params = {
            'states': [],
            'actions': [],
            'next_states': [],
        }

        states = np.repeat(state.reshape((1, -1)),
                           self.num_simulated_paths,
                           axis=0)

        for t in range(self.horizon):

            actions = np.array(
                [self.env.action_space.sample() for _ in states])

            states_next = self.dyn_model.predict(states, actions)

            tstep_params['states'].append(states)
            tstep_params['actions'].append(actions)
            tstep_params['next_states'].append(states_next)

            states = states_next

        costs = trajectory_cost_fn(self.cost_fn, **tstep_params)

        cost_argmin = np.argmin(costs)

        next_action_minimize_horizon_cost = tstep_params['actions'][0][
            cost_argmin]

        return next_action_minimize_horizon_cost
Example #29
    def get_action(self, state):
        # Note: be careful to batch your simulations through the model for
        # speed
        all_states = []
        all_actions = []
        all_next_states = []

        states = np.array([state] * self.num_simulated_paths)
        for _ in range(self.horizon):
            actions = np.array([
                self.env.action_space.sample()
                for _ in range(self.num_simulated_paths)
            ])
            next_states = self.dyn_model.predict(states, actions)

            all_states.append(states)
            all_actions.append(actions)
            all_next_states.append(next_states)

            states = next_states

        costs = trajectory_cost_fn(self.cost_fn, all_states, all_actions,
                                   all_next_states)
        return all_actions[0][np.argmin(costs)]
Example #30
def sample(env, 
           controller, 
           num_paths=10, 
           horizon=1000, 
           render=False,
           verbose=False):
    """
        Write a sampler function which takes in an environment, a controller (either random or the MPC controller), 
        and returns rollouts by running on the env. 
        Each path can have elements for observations, next_observations, rewards, returns, actions, etc.
    """
    paths = []
    """ YOUR CODE HERE """
    # iterate over the paths
    for n_path in range(num_paths):
        ob = env.reset()
        obs, n_obs, acs, rewards = [], [], [],[]
        # roll out for at most `horizon` timesteps, collecting states and actions
        for n_horz in range(horizon):
            if render:
                env.render()
            obs.append(ob)
            ac = controller.get_action(ob)
            acs.append(ac)
            ob, rew, done, _ = env.step(ac)
            n_obs.append(ob)
            rewards.append(rew)
            if done:
                break
        # collect the data and cost of the path
        cost = trajectory_cost_fn(cheetah_cost_fn, np.array(obs), np.array(acs), np.array(n_obs))
        # add data
        path = {"observations" : np.array(obs), 
                "next_observations" : np.array(n_obs),
                "rewards" : np.array(rewards), 
                "actions" : np.array(acs),
                "costs" : cost}
        paths.append(path)

    return paths
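A small usage sketch for `sample`, pairing it with the `RandomController` referenced in Example #25 (the environment id and hyperparameters here are illustrative assumptions, not from the original code):

import gym
import numpy as np

env = gym.make("HalfCheetah-v2")           # illustrative; the assignment ships its own cheetah env
random_controller = RandomController(env)  # assumed to return env.action_space.sample() each call
paths = sample(env, random_controller, num_paths=10, horizon=500)
print("mean per-path cost:", np.mean([p["costs"] for p in paths]))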