def render_test(torque_type=0):
    '''
    torque_types:
      - 0 : square-wave torque to make the pendulum oscillate back and forth
      - 1 : some constant-value torque
      - 2 (or anything else) : random torque
    '''
    env = PendulumEnv()
    env.reset()
    at_rest = True
    val = 0  # default torque so the square-wave branch is defined before the first swing
    try:
        for _ in range(500):
            env.render()
            if torque_type == 0:
                if env.state[0] == env.angle_limit and at_rest:
                    val = env.max_torque
                    at_rest = False
                elif env.state[0] == -env.angle_limit and at_rest:
                    val = -env.max_torque
                    at_rest = False
                if abs(env.state[0]) == env.angle_limit and not at_rest:
                    at_rest = True
                u = np.array([val]).astype(env.action_space.dtype)
            elif torque_type == 1:
                val = -43
                u = np.array([val]).astype(env.action_space.dtype)
            else:
                u = env.action_space.sample()
            info = env.step(u)
            # print(info)
            # print(env.state[0])  # angular position
            print(env.state[1])    # angular velocity
            time.sleep(.1)
    except KeyboardInterrupt:
        pass
    env.close()
def main():
    dummy_env = PendulumEnv()
    start_pos = dummy_env.angle_limit

    pol_dir = 'saved_policies'
    fname = 'interp316_pi84.npy'

    sim = Simulator(policy_directory=pol_dir, policy_name=fname)
    sim.simulate(ep_num=1, iter_num=200, start_pos=start_pos, start_vel=0)
    sim.save_precious_simulated_data()
class QLearning():
    def __init__(self, goal_theta_num=0, goal_theta_den=1):
        self.goal_theta_num = goal_theta_num
        self.goal_theta_den = goal_theta_den
        self.goal_theta = goal_theta_num / goal_theta_den
        self.env = PendulumEnv(goal_theta=self.goal_theta)
        self.save_directory = 'saved_policies'

        self.epsilon = .2
        self.gamma = .99

        self.num_avail_actions = 31
        self.num_avail_positions = 51
        self.num_avail_velocities = 51

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_avail_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_avail_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_avail_actions)

        self.q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))

        self.converge_threshold = 0.05  # fraction of a q-value that dq must be within for it to count as converged
        self.perc_conv_thresh = 0.8     # fraction of q-values that must pass the convergence threshold
        self.percent_converged = 0
        self.percent_unexplored = 0

        self.prev_q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))  # previous q-matrix

        self.dq_matrix = 100 * np.ones((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))  # delta-q matrix, tracks the amount each q-value is being updated

        self.data = dict()
        self.start_time = 0
        self.total_time = 0
        self.ep_rewards = []
        self.perc_unexplored_arr = []
        self.perc_conv_arr = []

    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()
        return torIdx, thIdx, thdotIdx

    def getMaxQValue(self, thIdx, thdotIdx):
        # returns the action (torque) index with the highest q-value for the given (th, thdot) state
        maxQValIdx = self.q_matrix[:, thIdx, thdotIdx].argmax()
        maxQVal = self.q_matrix[maxQValIdx, thIdx, thdotIdx]
        return maxQValIdx, maxQVal

    def get_action(self, th, thdot):
        random_float = random.random()
        if random_float < self.epsilon:
            # explore: choose a random action
            chosen_idx = np.random.randint(0, self.num_avail_actions)
        else:
            # exploit: choose the best action under the current policy
            _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
            chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)
        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)
        return u

    def check_converged(self):
        # total number of elements
        total_elements = self.q_matrix.size
        percent_changed = np.divide(self.dq_matrix, self.q_matrix,
                                    out=np.ones(self.q_matrix.shape),
                                    where=self.q_matrix != 0)
        # number of 'converged' q-values
        num_converged = (percent_changed < self.converge_threshold).sum()
        # percentage of converged q-values
        self.percent_converged = num_converged / total_elements
        self.percent_unexplored = (self.dq_matrix == 100).sum() / self.dq_matrix.size
        return self.percent_converged >= self.perc_conv_thresh

    def get_convergence_stats(self):
        # returns a dictionary mapping threshold values (0 to 0.95 in steps of 0.05)
        # to the fraction of the q-matrix that meets that convergence threshold
        conv_stats = dict()
        total_elements = self.q_matrix.size
        conv_threshold = 0.05
        for i in range(20):
            new_threshold = conv_threshold * i
            percent_changed = np.divide(self.dq_matrix, self.q_matrix,
                                        out=np.ones(self.q_matrix.shape),
                                        where=self.q_matrix != 0)
            num_converged = (percent_changed < new_threshold).sum()
            percent_converged = num_converged / total_elements
            conv_stats[new_threshold] = percent_converged
        return conv_stats

    def train(self, episodes=15000, max_iterations=100000, l_rate=0.1):
        self.start_time = time.time()
        for episode_num in range(episodes):
            self.episodes = episode_num
            self.iterations = max_iterations
            self.l_rate = l_rate

            # reset the environment and get the initial th, thdot
            th, thdot = self.env.reset()
            iter_count = -1
            total_reward = 0

            while not self.env.is_done and iter_count < max_iterations:
                iter_count += 1
                # select a new action to take
                u = self.get_action(th, thdot)
                # indices into the q-matrix for the current state-action pair (th, thdot, u)
                currTorIdx, currThIdx, currThdotIdx = self.getQMatrixIdx(th, thdot, u)
                # step the environment with the chosen action
                nextTh, nextThdot, reward = self.env.step(u)
                total_reward += reward
                _, nextThIdx, nextThdotIdx = self.getQMatrixIdx(nextTh, nextThdot, u)
                # highest q-value reachable from the next state
                _, nextQVal = self.getMaxQValue(nextThIdx, nextThdotIdx)

                self.q_matrix[currTorIdx, currThIdx, currThdotIdx] += l_rate * (reward + self.gamma * nextQVal
                    - self.q_matrix[currTorIdx, currThIdx, currThdotIdx])
                self.dq_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx] \
                    - self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx]
                self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx]

                th = nextTh
                thdot = nextThdot

                if iter_count % 100 == 0:
                    # self.env.render()
                    print('iter_count = ', iter_count)
                    print('episode = ', episode_num)
                    print('epsilon = ', self.epsilon)
                    print(f'percent converged = {round(self.percent_converged, 2)}')
                    print(f'percent unexplored = {round(self.percent_unexplored, 2)}')
                    print("Time Elapsed: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - self.start_time)))
                    print('')

            converged = self.check_converged()
            if converged:
                print(f'Converged on episode {episode_num}')
                break

            if episode_num % 10 == 0:
                self.ep_rewards.append(total_reward)
                self.perc_unexplored_arr.append(self.percent_unexplored)
                self.perc_conv_arr.append(self.percent_converged)

        self.print_stuff()
        self.save_policy()
        self.get_precious_data()

    def increase_epsilon_maybe(self, ep_num):
        # note: despite the name, this decays epsilon every 1000 episodes
        if ep_num % 1000 == 0 and ep_num != 0:
            self.epsilon -= .006

    def save_policy(self):
        time_struct = time.localtime(time.time())
        fname = self.get_fname(time_struct)
        self.data['fname'] = fname
        save_path = os.path.join(self.save_directory, fname)
        np.save(save_path, self.q_matrix)
        print(f'saved policy: {fname}')

    def get_fname(self, time_params):
        year = time_params.tm_year
        month = time_params.tm_mon
        day = time_params.tm_mday
        hour = time_params.tm_hour
        minute = time_params.tm_min
        sec = time_params.tm_sec

        if isclose(self.goal_theta_num, np.pi, abs_tol=1e-7):
            num = 'pi'
        elif isclose(self.goal_theta_num, -np.pi, abs_tol=1e-7):
            num = 'ip'
        else:
            num = self.goal_theta_num

        if isclose(self.goal_theta_den, np.pi, abs_tol=1e-7):
            den = 'pi'
        elif isclose(self.goal_theta_den, -np.pi, abs_tol=1e-7):
            den = 'ip'
        else:
            den = self.goal_theta_den

        fname = f'{year}_{month}_{day}_{hour}_{minute}_{sec}_{num}_{den}'
        return fname

    def print_stuff(self):
        print("Training Done")
        print("Total Time Elapsed: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - self.start_time)))
        print(f'percent converged = {round(self.percent_converged, 2)}')
        print(f'percent unexplored = {round(self.percent_unexplored, 2)}')

    def get_precious_data(self):
        self.data["gamma"] = self.gamma
        self.data["epsilon"] = self.epsilon
        self.data["goal_theta"] = self.goal_theta
        self.data["perc_actions_explored"] = 1 - self.percent_unexplored
        self.data["perc_converged"] = self.percent_converged
        self.data["conv_perc_req"] = self.perc_conv_thresh          # fraction of q-values that must pass the convergence threshold
        self.data["converge_threshold"] = self.converge_threshold  # fraction of a q-value that dq must be within to count as converged
        self.data["training_time"] = self.total_time
        self.data["training_episodes"] = self.episodes
        self.data["training_iterations"] = self.iterations
        self.data["learning_rate"] = self.l_rate
        self.data["policy"] = self.q_matrix
        self.data["converged_stats"] = self.get_convergence_stats()
        self.data["ep_rewards_arr"] = self.ep_rewards
        self.data["perc_unexplored_arr"] = self.perc_unexplored_arr
        self.data["perc_converged_arr"] = self.perc_conv_arr
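# --- Usage sketch (illustrative, not part of the original source): a short
# training run with the tabular QLearning class above. It assumes the custom
# PendulumEnv used by the class (reset() returning (th, thdot), an is_done
# flag); the goal angle and episode/iteration counts are arbitrary choices
# for a quick smoke test, not the project's actual settings.
trainer = QLearning(goal_theta_num=np.pi, goal_theta_den=4)   # target theta = pi/4
trainer.train(episodes=500, max_iterations=1000, l_rate=0.1)  # short run for illustration
# after training, trainer.data holds the q-matrix and the training statistics
print(trainer.data['perc_converged'])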
import numpy as np
import datetime
import time
import matplotlib.pyplot as plt

from pendulum import PendulumEnv
from pendulumAgent import DankAgent


def print_timestamp(string=""):
    now = datetime.datetime.now()
    print(string + now.strftime("%Y-%m-%d %H:%M"))


####### INITIALISATION ##########################################################
weights_file = "network.h5"
load_existent_model = False

env = PendulumEnv()
env.cnt += 1

agent = DankAgent([-env.max_torque, env.max_torque])
agent.model.summary()
if load_existent_model:
    agent.load(weights_file)

nepisodes = 0

######## CONSTANTS ############################################################
MEMORY_SIZE = 80000        # default: 50000
memory = []
TRAINING_EPISODES = 30     # default: 30000
AUTO_SAVER = 50
SHOW_PROGRESS = 10
#!/usr/bin/python3
import matplotlib.pyplot as plt
import gym
import sys
import random
from random import randrange
import numpy as np
import datetime
import time

from pendulum import PendulumEnv, angle_normalize
from pendulumAgent import DankAgent

####### INITIALISATION ##########################################################
weights_file = "network.h5"

env = PendulumEnv()
env.cnt += 1
state = env.reset()
state = state.reshape((1, 3))

action_dim = 25
action_high = env.action_space.high[0]
action_low = env.action_space.low[0]
action_sequence = np.linspace(action_low, action_high, action_dim)

agent = DankAgent(env.observation_space.shape[0], action_dim)
agent.model.summary()

nepisodes = 0
batch = []
TRAINING_EPISODES = 500
steps = 200
import sys
import argparse
import random
import datetime
import time

import numpy as np
from keras.models import load_model

from pendulum import PendulumEnv
from pendulumAgent import DankAgent, print_timestamp

# sys.path.append('..')
parser = argparse.ArgumentParser()

env = PendulumEnv()
env.cnt += 1

input_shape = 3
batch_size = 64
test_agent = DankAgent([-env.max_torque, env.max_torque], input_shape, batch_size)
test_agent.load('network_vanilla.h5')

list_episode_reward = []
TESTING_EPISODES = 10

for ep in range(TESTING_EPISODES):
    state = env.reset()
    episode_reward = 0
    for t in range(500):
        env.render()
plt.xlabel("Episode") plt.ylabel("Episode Reward") plt.title("Evaluation - Episode Reward over Time") fig2.savefig('new_results/' + 'evaluation_reward.png') if noshow: plt.close(fig2) else: plt.show(fig2) if __name__ == "__main__": stats = [] reward_func = True env = PendulumEnv(reward_function = reward_func) state_space_size = env.observation_space.shape[0] action_space_size = env.action_space.shape[0] action_bound = 2 max_time_per_episode = 200 num_episodes = 5000 discount_factor = 0.99 batch_size = 64 tau = 0.001 train = 10 stats = [] actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_space_size)) replay_memory = ReplayBuffer(num_episodes)
class QLearning():
    def __init__(self):
        self.env = PendulumEnv()
        self.env.reset()
        self.action_number = 101
        self.state_space = 2
        self.epsilon = 0.2
        self.bias = 0
        self.gamma = 0.99

        min_torque = -self.env.max_torque
        max_torque = self.env.max_torque
        self.actions = []
        for i in range(self.action_number):
            self.actions.append(min_torque + (max_torque - min_torque) / (self.action_number - 1) * i)
        print(self.actions)

        self.weight_matrix = np.zeros((len(self.actions), self.state_space))

    def train(self, episodes=10, max_iterations=20000, learning_rate=0.01):
        for episode_number in range(episodes):
            total_reward = 0
            curr_state_info = self.env.reset()
            curr_state = np.zeros(self.state_space)
            for i in range(self.state_space):
                curr_state[i] = curr_state_info[i]
            # print(curr_state)

            for iteration_number in range(max_iterations):
                time.sleep(0.1)
                print(curr_state)

                # epsilon-greedy action selection
                random_float = random.random()
                if random_float < self.epsilon:
                    # sample uniformly over all discretized actions
                    action = np.random.randint(0, self.action_number)
                else:
                    action = -1
                    max_q_s_a_w = -sys.float_info.max
                    for action_iter in range(self.action_number):
                        q_s_a_w_iter = np.dot(curr_state, self.weight_matrix[action_iter]) + self.bias
                        if q_s_a_w_iter > max_q_s_a_w:
                            max_q_s_a_w = q_s_a_w_iter
                            action = action_iter

                u = np.array([self.actions[action]]).astype(self.env.action_space.dtype)
                # u = np.array([500]).astype(self.env.action_space.dtype)
                next_state_info, reward, isDone, _ = self.env.step(u)
                print("reward : ", reward)
                print(self.actions[action])
                print("")

                next_state = np.zeros(self.state_space)
                for i in range(self.state_space):
                    next_state[i] = float(next_state_info[i])

                # greedy value of the next state
                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(self.action_number):
                    q_s_a_w_iter = np.dot(next_state, self.weight_matrix[action_iter]) + self.bias
                    if q_s_a_w_iter > max_q_s_a_w:
                        max_q_s_a_w = q_s_a_w_iter

                # semi-gradient Q-learning update of the linear weights and the bias
                gradient_matrix_w = np.zeros((self.action_number, self.state_space))
                gradient_matrix_w[action] = curr_state
                copy_of_weight_matrix = copy.deepcopy(self.weight_matrix)
                copy_of_bias = copy.deepcopy(self.bias)
                td_error = (np.dot(curr_state, copy_of_weight_matrix[action]) + copy_of_bias) \
                    - (reward + self.gamma * max_q_s_a_w)
                self.weight_matrix = self.weight_matrix - learning_rate * td_error * gradient_matrix_w
                self.bias = self.bias - learning_rate * td_error * 1.0

                curr_state = next_state
                total_reward += reward

                if True or episode_number % 100 == 0:
                    self.env.render()
        self.env.close()
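# --- Usage sketch (illustrative, not part of the original source): the class
# above approximates q(s, a) ~= w_a . s + b with one weight vector per
# discretized torque and a shared bias, updated by semi-gradient Q-learning.
# A short run might look like this; the episode and iteration counts are
# arbitrary choices for illustration only.
agent = QLearning()
agent.train(episodes=5, max_iterations=200, learning_rate=0.01)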
class ising:
    # Initialize the network
    def __init__(self, netsize, Nsensors=1, Nmotors=1):
        # Create the Ising model
        self.size = netsize        # Network size
        self.Ssize = Nsensors      # Number of sensors
        self.Msize = Nmotors       # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = PendulumEnv()
        self.observation = self.env.reset()

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        self.maxspeed = self.env.max_speed
        self.Update(-1)

    def get_state(self, mode='all'):
        if mode == 'all':
            return self.s
        elif mode == 'motors':
            return self.s[-self.Msize:]
        elif mode == 'sensors':
            return self.s[0:self.Ssize]
        elif mode == 'input':
            return self.sensors
        elif mode == 'non-sensors':
            return self.s[self.Ssize:]
        elif mode == 'hidden':
            return self.s[self.Ssize:-self.Msize]

    def get_state_index(self, mode='all'):
        return bool2int(0.5 * (self.get_state(mode) + 1))

    # Randomize the state of the network
    def randomize_state(self):
        self.s = np.random.randint(0, 2, self.size) * 2 - 1
        self.sensors = np.random.randint(0, 2, self.Ssize) * 2 - 1

    # Randomize the position of the agent
    def randomize_position(self):
        self.observation = self.env.reset()

    # Set random biases on the non-sensor units of the system
    def random_fields(self, max_weights=None):
        if max_weights is None:
            max_weights = self.max_weights
        self.h[self.Ssize:] = max_weights * \
            (np.random.rand(self.size - self.Ssize) * 2 - 1)

    # Set random connections between units of the system
    def random_wiring(self, max_weights=None):
        # Set random values for h and J
        if max_weights is None:
            max_weights = self.max_weights
        for i in range(self.size):
            for j in np.arange(i + 1, self.size):
                if i < j and (i >= self.Ssize or j >= self.Ssize):
                    self.J[i, j] = (np.random.rand(1) * 2 - 1) * self.max_weights

    # Update the position of the agent
    def Move(self):
        self.previous_speed = self.observation[1]
        action = np.sum(self.s[-self.Msize:]) / self.Msize
        observation, reward, done, info = self.env.step(action)
        self.observation = self.env.state
        self.positionX = np.cos(self.env.state[0]) * self.env.l
        self.positionY = np.sin(self.env.state[0]) * self.env.l
        self.speed = self.env.state[1]

    # Transform the sensor input into an integer index
    def SensorIndex(self, x, xmax):
        return int(np.floor((x + xmax) / (2 * xmax + 10 * np.finfo(float).eps) * 2**self.Ssize))

    # Update the state of the sensors
    def UpdateSensors(self):
        self.speed_ind = self.SensorIndex(self.speed, self.maxspeed)
        self.sensors = 2 * bitfield(self.speed_ind, self.Ssize) - 1

    # Execute one step of the Glauber algorithm to update the state of one unit
    def GlauberStep(self, i=None):
        if i is None:
            i = np.random.randint(self.size)
        I = 0
        if i < self.Ssize:
            I = self.sensors[i]
        eDiff = 2 * self.s[i] * (self.h[i] + I + np.dot(self.J[i, :] + self.J[:, i], self.s))
        if eDiff * self.Beta < np.log(1 / np.random.rand() - 1):  # Glauber acceptance rule
            self.s[i] = -self.s[i]

    # Update a random unit of the agent
    def Update(self, i=None):
        if i is None:
            i = np.random.randint(-1, self.size)
        if i == -1:
            self.Move()
            self.UpdateSensors()
        else:
            self.GlauberStep(i)

    # Sequentially update the state of all units of the agent in random order
    def SequentialUpdate(self):
        for i in np.random.permutation(range(-1, self.size)):
            self.Update(i)

    # Step of the learning algorithm to adjust correlations toward the critical regime
    def AdjustCorrelations(self, T=None):
        if T is None:
            T = self.defaultT

        self.m = np.zeros(self.size)
        self.c = np.zeros((self.size, self.size))
        self.C = np.zeros((self.size, self.size))

        # Main simulation loop:
        samples = []
        for t in range(T):
            self.SequentialUpdate()
            # self.x[t] = self.position
            self.m += self.s
            for i in range(self.size):
                self.c[i, i + 1:] += self.s[i] * self.s[i + 1:]
        self.m /= T
        self.c /= T
        for i in range(self.size):
            self.C[i, i + 1:] = self.c[i, i + 1:] - self.m[i] * self.m[i + 1:]

        c1 = np.zeros((self.size, self.size))
        for i in range(self.size):
            inds = np.array([], int)
            c = np.array([])
            for j in range(self.size):
                if not i == j:
                    inds = np.append(inds, [j])
                if i < j:
                    c = np.append(c, [self.c[i, j]])
                elif i > j:
                    c = np.append(c, [self.c[j, i]])
            order = np.argsort(c)[::-1]
            c1[i, inds[order]] = self.Cint[i, :]
        self.c1 = np.triu(c1 + c1.T, 1)
        self.c1 *= 0.5

        self.m[0:self.Ssize] = 0
        self.m1[0:self.Ssize] = 0
        self.c[0:self.Ssize, 0:self.Ssize] = 0
        self.c[-self.Msize:, -self.Msize:] = 0
        self.c[0:self.Ssize, -self.Msize:] = 0
        self.c1[0:self.Ssize, 0:self.Ssize] = 0
        self.c1[-self.Msize:, -self.Msize:] = 0
        self.c1[0:self.Ssize, -self.Msize:] = 0

        dh = self.m1 - self.m
        dJ = self.c1 - self.c
        # print(self.m1, self.m)
        # print("dh, dJ:", dh, ",", dJ)
        return dh, dJ

    # Algorithm for poising an agent in a critical regime
    def CriticalLearning(self, Iterations, T=None):
        u = 0.01
        count = 0
        dh, dJ = self.AdjustCorrelations(T)
        fit = max(np.max(np.abs(self.c1 - self.c)), np.max(np.abs(self.m1 - self.m)))
        for it in range(Iterations):
            count += 1
            self.h += u * dh
            self.J += u * dJ
            if it % 10 == 0:
                self.randomize_state()
                self.randomize_position()
            Vmax = self.max_weights
            for i in range(self.size):
                if np.abs(self.h[i]) > Vmax:
                    self.h[i] = Vmax * np.sign(self.h[i])
                for j in np.arange(i + 1, self.size):
                    if np.abs(self.J[i, j]) > Vmax:
                        self.J[i, j] = Vmax * np.sign(self.J[i, j])
            dh, dJ = self.AdjustCorrelations(T)
            fit = np.max(np.abs(self.c1 - self.c))
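# --- Illustration (not part of the original source): the flip test used in
# GlauberStep above accepts a flip when eDiff * Beta < log(1/u - 1) for
# u ~ Uniform(0, 1), which is equivalent to flipping with probability
# 1 / (1 + exp(Beta * eDiff)). The quick Monte Carlo check below compares the
# empirical flip rate against that closed form; the function name and sample
# count are illustrative choices.
import numpy as np

def glauber_flip_probability(eDiff, Beta=1.0, samples=100000, seed=0):
    rng = np.random.default_rng(seed)
    u = rng.random(samples)
    flips = eDiff * Beta < np.log(1 / u - 1)
    return flips.mean()

print(glauber_flip_probability(0.5), 1 / (1 + np.exp(0.5)))  # both ~0.3775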
class Simulator():
    def __init__(self, data_dictionary=None, policy_name=None, policy_directory=None):
        self.save_directory = 'rory_data'
        self.trainer = QLearning()
        self.data = data_dictionary
        self.num_actions = self.trainer.num_avail_actions
        self.num_positions = self.trainer.num_avail_positions
        self.num_velocities = self.trainer.num_avail_velocities

        if data_dictionary is None:
            if policy_directory is None:
                self.load_directory = self.trainer.save_directory
            else:
                self.load_directory = policy_directory

            if policy_name is None:
                self.policy_name = self.grab_newest_policy()
            else:
                self.policy_name = policy_name

            # goal_theta_num, goal_theta_den = self.get_goal_theta(self.policy_name)
            # self.goal_theta = goal_theta_num / goal_theta_den
            self.goal_theta = np.pi / 4
            self.file = os.path.join(self.load_directory, self.policy_name)
            self.policy = self.load_policy()
            self.data = dict()
        else:
            self.goal_theta = self.data['goal_theta']
            self.policy = self.data['policy']
            self.file = self.data['fname']
            self.policy_name = self.file

        self.env = PendulumEnv(goal_theta=self.goal_theta)
        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_actions)

        self.dummy_q = self.trainer.q_matrix
        self.num_useful_actions = 0
        self.torques = []
        self.theta_errors = []

    def load_policy(self):
        policy = np.load(self.file, allow_pickle=True)
        policy = policy.item().get('policy')
        return policy

    def grab_newest_policy(self):
        all_policies = os.listdir(self.load_directory)
        file_count = 0
        theta_dict = {}
        name_dict = {}
        for i, policy in enumerate(all_policies):
            if not policy.startswith('.'):  # ignore .DS_Store files, it's a Mac thing
                fname, _ = os.path.splitext(policy)
                try:
                    gtheta = '_'.join(fname.split('_')[-2:])
                    fname = '_'.join(fname.split('_')[:-2])
                    time_components = np.array(list(map(int, fname.split('_'))))
                    file_count += 1
                except ValueError:
                    continue
                theta_dict[i] = gtheta
                if file_count == 1:
                    name_array = time_components
                else:
                    name_array = np.row_stack((name_array, time_components))
                name_dict[fname] = i

        while len(name_array.shape) > 1:
            col_idx_diff = np.any(name_array != name_array[0, :], axis=0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]

        newest_policy = name_array
        newest_policy = '_'.join(map(str, newest_policy))
        suffix_theta = theta_dict[name_dict[newest_policy]]
        newest_policy += '_' + suffix_theta + '.npy'
        return newest_policy

    def get_goal_theta(self, pol_name):
        # same logic as grab_newest_policy(), except it just grabs the goal theta
        fname, _ = os.path.splitext(pol_name)
        len_fname = len(fname) - 1
        if 'pi' in fname:
            idx_pi = fname.find('pi')
            if idx_pi != len_fname:
                # 'pi' is in the numerator position
                num = np.pi
            else:
                # 'pi' is in the denominator position
                den = np.pi
            fname = fname.replace('pi', '555')
        if 'ip' in fname:
            idx_ip = fname.find('ip')
            if idx_ip != len_fname:
                # 'ip' (negative pi) is in the numerator position
                num = -np.pi
            else:
                # 'ip' (negative pi) is in the denominator position
                den = -np.pi
            fname = fname.replace('ip', '555')

        time_components = np.array(list(map(int, fname.split('_'))))
        name_array = time_components
        while len(name_array.shape) > 1:
            col_idx_diff = np.any(name_array != name_array[0, :], axis=0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]

        newest_policy = name_array
        temp_num = newest_policy[-2]
        temp_den = newest_policy[-1]
        if temp_num != 555:
            num = temp_num
        if temp_den != 555:
            den = temp_den
        return num, den

    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()
        return torIdx, thIdx, thdotIdx

    def getMaxQValue(self, thIdx, thdotIdx):
        # returns the action (torque) index with the highest q-value for the given (th, thdot) state
        maxQValIdx = self.policy[:, thIdx, thdotIdx].argmax()
        maxQVal = self.policy[maxQValIdx, thIdx, thdotIdx]
        return maxQValIdx, maxQVal

    def get_action(self, th, thdot):
        _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
        chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)
        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)
        return u

    def save_precious_simulated_data(self):
        self.data["simulated_episodes"] = self.num_episodes
        self.data["simulated_iterations"] = self.num_iterations
        self.data["avg_sim_cost"] = self.avg_cost
        self.data["perc_useful_actions"] = self.num_useful_actions
        self.data["torque_arr"] = self.torques
        self.data["theta_error_arr"] = self.theta_errors

        fname = self.policy_name
        save_path = os.path.join(self.save_directory, fname)
        if not os.path.exists(self.save_directory):
            os.mkdir(self.save_directory)
        np.save(save_path, self.data)
        print(f'saved precious data: {fname}')

    def update_dummy_q(self, torI, thI, thDotI):
        if self.dummy_q[torI, thI, thDotI] != 1000:
            self.dummy_q[torI, thI, thDotI] = 1000

    def simulate(self, ep_num=500, iter_num=150, start_pos=None, start_vel=None):
        print(f'Running simulation using policy: {self.file}')
        self.num_episodes = ep_num
        self.num_iterations = iter_num
        total_total_cost = 0
        try:
            for i in range(self.num_episodes):
                th, thdot = self.env.reset()
                if start_pos is not None:
                    self.env.state[0] = start_pos
                    th = start_pos
                if start_vel is not None:
                    self.env.state[1] = start_vel
                    thdot = start_vel
                for _ in range(self.num_iterations):
                    self.env.render()
                    self.theta_errors.append(self.goal_theta - th)
                    u = self.get_action(th, thdot)
                    self.torques.append(u)
                    torIdx, thIdx, thDotIdx = self.getQMatrixIdx(th, thdot, u)
                    self.update_dummy_q(torIdx, thIdx, thDotIdx)
                    nextTh, nextThdot, reward = self.env.step(u)
                    total_total_cost += reward
                    th = nextTh
                    thdot = nextThdot
                    time.sleep(.05)
                if i != self.num_episodes - 1:
                    self.torques = []
                    self.theta_errors = []
        except KeyboardInterrupt:
            pass
        self.avg_cost = total_total_cost / self.num_episodes
        self.num_useful_actions = (self.dummy_q == 1000).sum() / self.dummy_q.size
        self.env.close()
import numpy as np
from scipy.linalg import solve_continuous_are as riccati

from pendulum import PendulumEnv


def lqr(A, B, Q, R, N=None):
    # Solve the continuous-time algebraic Riccati equation
    P = riccati(A, B, Q, R, e=N)
    R_inv = np.linalg.inv(R)
    if N is not None:
        K = np.dot(R_inv, np.dot(B.T, P) + N.T)
    else:
        K = np.dot(R_inv, np.dot(B.T, P))
    return K


# create environment
env = PendulumEnv()

# reset environment
state = env.reset()
done = False

# Get linearized dynamics about the upright equilibrium
A, B = env._linearize(np.pi)

# create cost matrices for LQR
Q = np.array([[1, 0],
              [0, 10]])
R = np.array([[0.001]])

# Compute gain matrix K
K = lqr(A, B, Q, R)
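# --- Closed-loop sketch (illustrative, not part of the original source):
# applying the gain K computed above as u = -K @ (x - x_goal), with the goal
# state assumed to be the upright equilibrium [pi, 0] that the dynamics were
# linearized about. The env.state layout ([theta, theta_dot]) and the
# env.step()/render()/max_torque interface are assumptions based on how the
# custom PendulumEnv is used elsewhere in this project.
x_goal = np.array([np.pi, 0.0])
for _ in range(200):
    x = np.array(env.state)                           # [theta, theta_dot]
    u = -K @ (x - x_goal)                             # LQR state-feedback law
    u = np.clip(u, -env.max_torque, env.max_torque)   # respect actuator limits
    env.step(u.astype(env.action_space.dtype))
    env.render()
env.close()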
N_rk4 = 10   # RK4 steps
dt = 0.1     # delta time [s]
N = 500      # horizon length
gamma = 0.99

costs = []
acc_costs = []
u_opt_f = []
th_opt_f = []
thdot_opt_f = []
g_1_f = []
g_2_f = []
iter_f = []
iter_total = 0

with contextlib.closing(PendulumEnv(N_rk4=N_rk4, DT=dt)) as env:
    s = env.reset(fixed=True)  # fixed start at (pi, 0)
    x0bar = np.array(s)
    q = 0
    for i in range(N):
        env.render()
        u_opt, th_opt, thdot_opt, g_1, g_2, l, iter_count = simultaneous_opt(
            s, q, env, N - i, gamma)
        s, c, d, _ = env.step(u_opt[0], noise=True)  # state noise
        q = c + gamma * q

        u_opt_f.append(u_opt[0])
        th_opt_f.append(th_opt[0])
        thdot_opt_f.append(thdot_opt[0])
        costs.append(c)  # gamma is implicit
        g_1_f.append(g_1[0])
        g_2_f.append(g_2[0])
import sys

import numpy as np
import tensorflow as tf

from pendulum import PendulumEnv
from Actor_Network_TRPO import *
from Critic_Network_TRPO import *
from utility import *

if __name__ == "__main__":
    print("start")
    env = PendulumEnv()

    action_space = np.arange(-2, 2.01, 0.01)
    num_actions = len(action_space)
    action_dim = 1
    action_bound = env.action_space.high
    state_dim = 3

    batch_size = 32
    learning_rate = 0.001
    delta = 0.01
    discount_factor = 0.99
    num_episodes = 10
    len_episode = 100
    epsilon = 0.1
    load = False

    if not load:
        g_stat = []
        config = tf.ConfigProto()
        config.intra_op_parallelism_threads = 8
        config.gpu_options.per_process_gpu_memory_fraction = 0.33
        with tf.Session(config=config) as sess: