Example #1
def render_test(torque_type=0):
    '''
    torque_types:
        - 0 : square-wave torque that makes the pendulum oscillate back and forth
        - 1 : a constant torque
        - 2 (or anything else) : random torque sampled from the action space
    '''

    env = PendulumEnv()
    env.reset()

    at_rest = True
    val = 0  # default torque so `u` is defined even before the first limit crossing

    try:
        for _ in range(500):
            env.render()

            if torque_type == 0:

                if env.state[0] == env.angle_limit and at_rest:
                    val = env.max_torque
                    at_rest = False

                elif env.state[0] == -env.angle_limit and at_rest:
                    val = -env.max_torque
                    at_rest = False

                if abs(env.state[0]) == env.angle_limit and not at_rest:
                    at_rest = True

                u = np.array([val]).astype(env.action_space.dtype)

            elif torque_type == 1:
                val = -43
                u = np.array([val]).astype(env.action_space.dtype)

            else:
                u = env.action_space.sample()

            step_result = env.step(u)
            # print(step_result)
            # print(env.state[0]) # print angular position
            print(env.state[1])  # print angular velocity

            time.sleep(.1)

    except KeyboardInterrupt:
        pass

    env.close()
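For reference, the three torque modes described in the docstring could be exercised like this (hypothetical calls, assuming render_test() and its PendulumEnv dependency are importable):

# each call runs for up to 500 steps or until Ctrl-C
render_test(torque_type=0)  # square-wave torque
render_test(torque_type=1)  # constant torque
render_test(torque_type=2)  # random torque sampled from the action space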
Example #2
    return K

# create environment
env = PendulumEnv()

# reset environment
state = env.reset()
done = False

# Get linearized dynamics about the operating point theta = pi
A, B = env._linearize(np.pi)

# create cost matrices for LQR
Q = np.array([[1,0], [0,10]])
R = np.array([[0.001]])

# Compute gain matrix K
K = lqr(A,B,Q,R)

# Run environment
i = 0
while not done:
    env.render()
    if i >= 1:
        # the state returned by env.step() needs flattening before applying the feedback law u = -K(x - x_ref)
        action = -np.matmul(K, state.reshape(2) - np.array([np.pi, 0]))
    else:
        action = -np.matmul(K, state - np.array([np.pi, 0]))
    state, _, done, _ = env.step(action)
    i += 1
env.close()
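The body of lqr() is cut off at the top of this excerpt; only its final `return K` is shown. A minimal sketch with the same signature, assuming the dynamics from env._linearize() are continuous-time and using scipy's Riccati solver, could look like:

import numpy as np
from scipy.linalg import solve_continuous_are

def lqr(A, B, Q, R):
    # solve the continuous-time algebraic Riccati equation for P
    P = solve_continuous_are(A, B, Q, R)
    # optimal state-feedback gain: K = R^-1 B^T P
    K = np.linalg.solve(R, B.T @ P)
    return K

With the matrices above, Q = diag(1, 10) penalizes angular-velocity error more heavily than position error, and the small R = 0.001 makes control effort cheap.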
Example #3
class QLearning():
    def __init__(self):
        self.env = PendulumEnv()
        self.env.reset()
        self.action_number = 101
        self.state_space = 2
        self.epsilon = 0.2
        self.bias = 0
        self.gamma = 0.99
        min_torque = -self.env.max_torque
        max_torque = self.env.max_torque
        self.actions = []
        for i in range(self.action_number):
            self.actions.append(min_torque + (max_torque - min_torque) /
                                (self.action_number - 1) * i)
        print(self.actions)
        self.weight_matrix = np.zeros((len(self.actions), self.state_space))

    def train(self, episodes=10, max_iterations=20000, learning_rate=0.01):
        for episodes_number in range(episodes):
            total_reward = 0
            curr_state_info = self.env.reset()
            curr_state = np.zeros(self.state_space)
            for i in range(self.state_space):
                curr_state[i] = curr_state_info[i]

            #print (curr_state)
            for iteration_number in range(max_iterations):
                time.sleep(0.1)
                print(curr_state)
                random_float = random.random()
                if (random_float < self.epsilon):
                    action = np.random.randint(0, self.action_number)  # high is exclusive, so every action index can be drawn
                else:
                    action = -1
                    max_q_s_a_w = -sys.float_info.max
                    for action_iter in range(self.action_number):
                        q_s_a_w_iter = np.dot(
                            curr_state,
                            self.weight_matrix[action_iter]) + self.bias
                        if (q_s_a_w_iter > max_q_s_a_w):
                            max_q_s_a_w = q_s_a_w_iter
                            action = action_iter

                u = np.array([self.actions[action]
                              ]).astype(self.env.action_space.dtype)
                #u = np.array([500]).astype(self.env.action_space.dtype)

                next_state_info, reward, isDone, _ = self.env.step(u)
                print("reward : ", reward)
                print(self.actions[action])
                print("")

                next_state = np.zeros((self.state_space))
                for i in range(self.state_space):
                    next_state[i] = float(next_state_info[i])

                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(self.action_number):
                    q_s_a_w_iter = np.dot(
                        next_state,
                        self.weight_matrix[action_iter]) + self.bias
                    if (q_s_a_w_iter > max_q_s_a_w):
                        max_q_s_a_w = q_s_a_w_iter

                gradient_matrix_w = np.zeros(
                    (self.action_number, self.state_space))
                gradient_matrix_w[action] = curr_state

                copy_of_weight_matrix = copy.deepcopy(self.weight_matrix)
                copy_of_bias = copy.deepcopy(self.bias)

                self.weight_matrix = self.weight_matrix - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) +
                     copy_of_bias) -
                    (reward + self.gamma * max_q_s_a_w)) * gradient_matrix_w
                self.bias = self.bias - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) +
                     copy_of_bias) - (reward + self.gamma * max_q_s_a_w)) * 1.0

                curr_state = next_state
                total_reward += reward

                if (True or episodes_number % 100 == 0):  # always render; drop "True or" to render only every 100th episode
                    self.env.render()

        self.env.close()
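The train() loop above performs epsilon-greedy, semi-gradient Q-learning with a linear approximator Q(s, a) = w_a · s + b. A condensed sketch of a single update step, using hypothetical names for the same quantities that appear in the loop:

import numpy as np

def q_update(w, b, s, a, reward, s_next, gamma=0.99, lr=0.01):
    # current estimate Q(s, a) under the linear approximator
    q_sa = np.dot(s, w[a]) + b
    # bootstrapped target: r + gamma * max_a' Q(s', a')
    target = reward + gamma * max(np.dot(s_next, w_a) + b for w_a in w)
    # TD error; the gradient of Q(s, a) is s w.r.t. w[a] and 1 w.r.t. b
    td_error = q_sa - target
    w[a] = w[a] - lr * td_error * s
    b = b - lr * td_error
    return w, b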
class Simulator():

    def __init__(self, data_dictionary=None, policy_name=None, policy_directory=None):

        self.save_directory = 'rory_data'

        self.trainer = QLearning()
        self.data = data_dictionary

        self.num_actions = self.trainer.num_avail_actions
        self.num_positions = self.trainer.num_avail_positions
        self.num_velocities = self.trainer.num_avail_velocities

        if data_dictionary is None:
            if policy_directory is None:
                self.load_directory = self.trainer.save_directory
            else:
                self.load_directory = policy_directory

            if policy_name is None:
                self.policy_name = self.grab_newest_policy()
            else:
                self.policy_name = policy_name

            # goal_theta_num, goal_theta_den = self.get_goal_theta(self.policy_name)

            # self.goal_theta = goal_theta_num / goal_theta_den
            self.goal_theta = np.pi / 4

            self.file = os.path.join(self.load_directory, self.policy_name)
            self.policy = self.load_policy()
            self.data = dict()
            

        else:
            self.goal_theta = self.data['goal_theta']
            self.policy = self.data['policy']
            self.file = self.data['fname']
            self.policy_name = self.file

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_actions)

        self.dummy_q = self.trainer.q_matrix
        self.num_useful_actions = 0

        self.torques = []
        self.theta_errors = []

    
    def load_policy(self):
        policy = np.load(self.file, allow_pickle=True)
        policy = policy.item().get('policy')
        return policy

    
    def grab_newest_policy(self):
        all_policies = os.listdir(self.load_directory)

        file_count = 0
        theta_dict = {}
        name_dict = {}

        for i, policy in enumerate(all_policies):

            if not policy.startswith('.'):  # ignore hidden files such as .DS_Store (a macOS thing)
                fname, _ = os.path.splitext(policy)
                
                try:
                    gtheta = '_'.join(fname.split('_')[-2:])
                    fname = '_'.join(fname.split('_')[:-2])
                    time_components = np.array(list(map(int, fname.split('_'))))
                    file_count += 1
                except ValueError:
                    continue

                theta_dict[i] = gtheta

                if file_count == 1:
                    name_array = time_components
                else:
                    name_array = np.row_stack((name_array, time_components))

                name_dict[fname] = i

        while len(name_array.shape) > 1:
            col_idx_diff =  np.any(name_array != name_array[0,:], axis = 0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]
        
        newest_policy = name_array
        newest_policy = '_'.join(map(str, newest_policy))
        suffix_theta = theta_dict[name_dict[newest_policy]]
        newest_policy += '_' + suffix_theta + '.npy'
            
        return newest_policy
    

    def get_goal_theta(self,pol_name):
        # same logic as grab_newest_policy(), except it only extracts the goal theta
        fname, _ = os.path.splitext(pol_name)
        len_fname = len(fname) - 1

        if 'pi' in fname:
            idx_pi = fname.find('pi')
            
            if idx_pi != len_fname: # index of numerator
                num = np.pi
            else: # index of denominator
                den = np.pi

            fname = fname.replace('pi','555')
        
        if 'ip' in fname:
            idx_ip = fname.find('ip')

            if idx_ip != len_fname: # index of numerator
                num = -np.pi
            else: # index of denominator
                den = -np.pi

            fname = fname.replace('ip','555')
                       
        time_components = np.array(list(map(int, fname.split('_'))))
        name_array = time_components

        while len(name_array.shape) > 1:
            col_idx_diff =  np.any(name_array != name_array[0,:], axis = 0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]
        
        newest_policy = name_array
        temp_num = newest_policy[-2]
        temp_den = newest_policy[-1]

        if temp_num != 555:
            num = temp_num
        if temp_den != 555:
            den = temp_den

        return num, den


    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()

        return torIdx, thIdx, thdotIdx


    def getMaxQValue(self, th, thdot):
        # returns the action index with the highest Q-value for the given (th, thdot) state, and that value
        maxQValIdx = self.policy[:, th, thdot].argmax()
        maxQVal = self.policy[maxQValIdx, th, thdot]

        return maxQValIdx, maxQVal


    def get_action(self, th, thdot):
        _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
        chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)

        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)

        return u

    
    def save_precious_simulated_data(self):
        self.data["simulated_episodes"] = self.num_episodes
        self.data["simulated_iterations"] = self.num_iterations
        self.data["avg_sim_cost"] = self.avg_cost
        self.data["perc_useful_actions"] = self.num_useful_actions
        self.data["torque_arr"] = self.torques
        self.data["theta_error_arr"] = self.theta_errors

        fname = self.policy_name

        save_path = os.path.join(self.save_directory, fname)

        if not os.path.exists(self.save_directory):
            os.mkdir(self.save_directory)

        np.save(save_path, self.data)

        print(f'saved precious data: {fname}')


    def update_dummy_q(self, torI, thI, thDotI):
        # mark this (torque, theta, theta_dot) cell as visited; counted later as a "useful action"
        if self.dummy_q[torI, thI, thDotI] != 1000:
            self.dummy_q[torI, thI, thDotI] = 1000

    
    def simulate(self, ep_num=500, iter_num=150, start_pos=None, start_vel=None):
        print(f'Running simulation using policy: {self.file}')

        self.num_episodes = ep_num
        self.num_iterations = iter_num
        total_total_cost = 0

        try:

            for i in range(self.num_episodes):

                th, thdot = self.env.reset()

                if start_pos is not None:
                    self.env.state[0] = start_pos
                    th = start_pos
                
                if start_vel is not None:
                    self.env.state[1] = start_vel
                    thdot = start_vel

                for _ in range(self.num_iterations):

                    self.env.render()

                    self.theta_errors.append(self.goal_theta - th)

                    u = self.get_action(th, thdot)
                    self.torques.append(u)

                    torIdx,thIdx,thDotIdx = self.getQMatrixIdx(th, thdot, u)

                    self.update_dummy_q(torIdx,thIdx,thDotIdx)

                    nextTh, nextThdot, reward = self.env.step(u)
                    total_total_cost += reward

                    th = nextTh
                    thdot = nextThdot
                
                    time.sleep(.05)

                if i != self.num_episodes-1:
                    self.torques = []
                    self.theta_errors = []
        
        except KeyboardInterrupt:
            pass

        self.avg_cost = total_total_cost / self.num_episodes
        self.num_useful_actions = (self.dummy_q == 1000).sum() / self.dummy_q.size  # fraction of Q-matrix cells actually visited
        
        self.env.close()
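A possible driver for the Simulator class above (hypothetical calls; assumes the QLearning trainer exposes the num_avail_* attributes, q_matrix, and save_directory that __init__ expects):

# load the newest saved policy and replay it from a fixed initial condition
sim = Simulator()
sim.simulate(ep_num=5, iter_num=150, start_pos=0.0, start_vel=0.0)
sim.save_precious_simulated_data()  # writes the collected torques and theta errors to rory_data/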