def render_test(torque_type=0):
    '''
    torque_types:
        - 0 : square wave torque to make the pendulum oscillate back and forth
        - 1 : some constant value torque
        - 2 (or anything else) : random torque
    '''
    env = PendulumEnv()
    env.reset()
    at_rest = True
    val = 0  # fallback torque until the pendulum first reaches an angle limit (avoids an undefined name)
    try:
        for _ in range(500):
            env.render()
            if torque_type == 0:
                if env.state[0] == env.angle_limit and at_rest:
                    val = env.max_torque
                    at_rest = False
                elif env.state[0] == -env.angle_limit and at_rest:
                    val = -env.max_torque
                    at_rest = False
                if abs(env.state[0]) == env.angle_limit and not at_rest:
                    at_rest = True
                u = np.array([val]).astype(env.action_space.dtype)
            elif torque_type == 1:
                val = -43
                u = np.array([val]).astype(env.action_space.dtype)
            else:
                u = env.action_space.sample()
            info = env.step(u)
            # print(info)
            # print(env.state[0])  # print angular position
            print(env.state[1])    # print angular velocity
            time.sleep(.1)
    except KeyboardInterrupt:
        pass
    env.close()
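# Usage sketch (hypothetical driver; it assumes numpy, time, and PendulumEnv are
# imported elsewhere in this file):
if __name__ == '__main__':
    # Square-wave torque demo; pass 1 for a constant torque or 2 for random torques.
    render_test(torque_type=0)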
    return K


# create environment
env = PendulumEnv()

# reset environment
state = env.reset()
done = False

# Get linearized dynamics
A, B = env._linearize(np.pi)

# create cost matrices for LQR
Q = np.array([[1, 0],
              [0, 10]])
R = np.array([[0.001]])

# Compute gain matrix K
K = lqr(A, B, Q, R)

# Run environment
i = 0
while not done:
    env.render()
    if i >= 1:
        action = -np.matmul(K, state.reshape(2) - np.array([np.pi, 0]))
    else:
        action = -np.matmul(K, state - np.array([np.pi, 0]))
    state, _, done, _ = env.step(action)
    i += 1
env.close()
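# The body of lqr() is truncated above. For reference only, a minimal sketch of a
# continuous-time LQR gain computation it could correspond to (an assumption, not
# necessarily the implementation used here): solve the algebraic Riccati equation
# for P, then take K = R^-1 B^T P.
import numpy as np
from scipy import linalg


def lqr_sketch(A, B, Q, R):
    # Solve the continuous-time algebraic Riccati equation for P
    P = linalg.solve_continuous_are(A, B, Q, R)
    # LQR gain: K = R^-1 B^T P
    K = np.linalg.inv(R) @ B.T @ P
    return K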
class QLearning():
    def __init__(self, goal_theta_num=0, goal_theta_den=1):
        self.goal_theta_num = goal_theta_num
        self.goal_theta_den = goal_theta_den
        self.goal_theta = goal_theta_num / goal_theta_den
        self.env = PendulumEnv(goal_theta=self.goal_theta)
        self.save_directory = 'saved_policies'

        self.epsilon = .2
        self.gamma = .99

        self.num_avail_actions = 31
        self.num_avail_positions = 51
        self.num_avail_velocities = 51

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_avail_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_avail_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_avail_actions)

        self.q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))

        self.converge_threshold = 0.05  # fraction of a q-value that its update (dq) must fall below to count as converged
        self.perc_conv_thresh = 0.8     # fraction of q-values that must pass the convergence threshold
        self.percent_converged = 0
        self.percent_unexplored = 0

        self.prev_q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))  # previous q-matrix

        self.dq_matrix = 100 * np.ones((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))  # delta-q matrix, tracks the amount each q-value is being updated

        self.data = dict()
        self.start_time = 0
        self.total_time = 0
        self.ep_rewards = []
        self.perc_unexplored_arr = []
        self.perc_conv_arr = []

    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()
        return torIdx, thIdx, thdotIdx

    def getMaxQValue(self, th, thdot):
        # returns the action (depth) index and q-value of the highest-valued torque for the given (th, thdot) state
        maxQValIdx = self.q_matrix[:, th, thdot].argmax()
        maxQVal = self.q_matrix[maxQValIdx, th, thdot]
        return maxQValIdx, maxQVal

    def get_action(self, th, thdot):
        random_float = random.random()
        if random_float < self.epsilon:
            # if the random float is less than our epsilon, explore a random action
            chosen_idx = np.random.randint(0, self.num_avail_actions)
        else:
            # otherwise, exploit the current policy
            _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
            chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)
        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)
        return u

    def check_converged(self):
        # total number of elements
        total_elements = self.q_matrix.size
        percent_changed = np.divide(self.dq_matrix, self.q_matrix,
                                    out=np.ones(self.q_matrix.shape),
                                    where=self.q_matrix != 0)
        # compute the number of 'converged' q-values
        num_converged = (percent_changed < self.converge_threshold).sum()
        # percentage of converged q-values
        self.percent_converged = num_converged / total_elements
        self.percent_unexplored = (self.dq_matrix == 100).sum() / self.dq_matrix.size
        if self.percent_converged >= self.perc_conv_thresh:
            return True
        else:
            return False

    def get_convergence_stats(self):
        # returns a dictionary where the keys are threshold values (0 to 0.95 in steps of 0.05)
        # and the values are the percentage of the q-matrix that meets that threshold of convergence
        conv_stats = dict()
        total_elements = self.q_matrix.size
        conv_threshold = 0.05
        for i in range(20):
            new_threshold = conv_threshold * i
            percent_changed = np.divide(self.dq_matrix, self.q_matrix,
                                        out=np.ones(self.q_matrix.shape),
                                        where=self.q_matrix != 0)
            # compute the number of 'converged' q-values
            num_converged = (percent_changed < new_threshold).sum()
            # percentage of converged q-values
            percent_converged = num_converged / total_elements
            conv_stats[new_threshold] = percent_converged
        return conv_stats

    def train(self, episodes=15000, max_iterations=100000, l_rate=0.1):
        self.start_time = time.time()
        for episode_num in range(episodes):
            self.episodes = episode_num
            self.iterations = max_iterations
            self.l_rate = l_rate

            # reset the environment and declare th, thdot
            th, thdot = self.env.reset()
            iter_count = -1
            total_reward = 0

            while not self.env.is_done and iter_count < max_iterations:
                iter_count += 1

                # select a new action to take
                u = self.get_action(th, thdot)

                # find the current indices in the q-matrix so we can update the q-value for this (th, thdot, u)
                currTorIdx, currThIdx, currThdotIdx = self.getQMatrixIdx(th, thdot, u)

                # find the next state corresponding to the chosen action
                nextTh, nextThdot, reward = self.env.step(u)
                total_reward += reward
                _, nextThIdx, nextThdotIdx = self.getQMatrixIdx(nextTh, nextThdot, u)

                # find the highest q-value in the q-matrix given (nextTh, nextThdot)
                _, nextQVal = self.getMaxQValue(nextThIdx, nextThdotIdx)

                self.q_matrix[currTorIdx, currThIdx, currThdotIdx] += l_rate * (reward + self.gamma * nextQVal
                    - self.q_matrix[currTorIdx, currThIdx, currThdotIdx])

                self.dq_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx] \
                    - self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx]

                self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx]

                th = nextTh
                thdot = nextThdot

                if iter_count % 100 == 0:
                    # self.env.render()
                    print('iter_count = ', iter_count)
                    print('episode = ', episode_num)
                    print('epsilon = ', self.epsilon)
                    print(f'percent converged = {round(self.percent_converged, 2)}')
                    print(f'percent unexplored = {round(self.percent_unexplored, 2)}')
                    print("Time Elapsed: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - self.start_time)))
                    print('')

            converged = self.check_converged()
            if converged:
                print(f'Converged on episode {episode_num}')
                break

            if episode_num % 10 == 0:
                self.ep_rewards.append(total_reward)
                self.perc_unexplored_arr.append(self.percent_unexplored)
                self.perc_conv_arr.append(self.percent_converged)

        self.print_stuff()
        self.save_policy()
        self.get_precious_data()

    def increase_epsilon_maybe(self, ep_num):
        if ep_num % 1000 == 0 and ep_num != 0:
            self.epsilon -= .006

    def save_policy(self):
        time_struct = time.localtime(time.time())
        fname = self.get_fname(time_struct)
        self.data['fname'] = fname
        save_path = os.path.join(self.save_directory, fname)
        np.save(save_path, self.q_matrix)
        print(f'saved policy: {fname}')

    def get_fname(self, time_params):
        year = time_params.tm_year
        month = time_params.tm_mon
        day = time_params.tm_mday
        hour = time_params.tm_hour
        minute = time_params.tm_min
        sec = time_params.tm_sec

        if isclose(self.goal_theta_num, np.pi, abs_tol=1e-7):
            num = 'pi'
        elif isclose(self.goal_theta_num, -np.pi, abs_tol=1e-7):
            num = 'ip'
        else:
            num = self.goal_theta_num

        if isclose(self.goal_theta_den, np.pi, abs_tol=1e-7):
            den = 'pi'
        elif isclose(self.goal_theta_den, -np.pi, abs_tol=1e-7):
            den = 'ip'
        else:
            den = self.goal_theta_den

        fname = f'{year}_{month}_{day}_{hour}_{minute}_{sec}_{num}_{den}'
        return fname

    def print_stuff(self):
        print("Training Done")
        print("Total Time Elapsed: ", time.strftime("%H:%M:%S", time.gmtime(time.time() - self.start_time)))
        print(f'percent converged = {round(self.percent_converged, 2)}')
        print(f'percent unexplored = {round(self.percent_unexplored, 2)}')

    def get_precious_data(self):
        self.data["gamma"] = self.gamma
        self.data["epsilon"] = self.epsilon
        self.data["goal_theta"] = self.goal_theta
        self.data["perc_actions_explored"] = 1 - self.percent_unexplored
        self.data["perc_converged"] = self.percent_converged
        self.data["conv_perc_req"] = self.perc_conv_thresh          # fraction of q-values that must pass the convergence threshold
        self.data["converge_threshold"] = self.converge_threshold  # fraction of a q-value that dq must fall below to count as converged
        self.data["training_time"] = self.total_time
        self.data["training_episodes"] = self.episodes
        self.data["training_iterations"] = self.iterations
        self.data["learning_rate"] = self.l_rate
        self.data["policy"] = self.q_matrix
        self.data["converged_stats"] = self.get_convergence_stats()
        self.data["ep_rewards_arr"] = self.ep_rewards
        self.data["perc_unexplored_arr"] = self.perc_unexplored_arr
        self.data["perc_converged_arr"] = self.perc_conv_arr
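# Usage sketch (hypothetical driver; assumes this class sits alongside PendulumEnv
# and that the 'saved_policies' directory exists for save_policy()):
if __name__ == '__main__':
    trainer = QLearning(goal_theta_num=np.pi, goal_theta_den=4)  # goal angle of pi/4
    trainer.train(episodes=15000, max_iterations=100000, l_rate=0.1)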
            terminal[i] = self.terminal.take(index, axis=0, mode='wrap')
        return state, action, next_state, reward, terminal


for ep in range(TRAINING_EPISODES):
    if ep % AUTO_SAVER == 0 and nepisodes != 0:
        print_timestamp("saved")
        agent.save(weights_file)
    state = env.reset()
    state = np.array((state[0], state[2]))
    state = np.round(state, 2)

    for t in range(TIMESTEPS):
        env.render()
        action = random.choice(agent.discrete_actions)
        next_state, reward, done, _ = env.step(action, state[1])
        next_state = np.array((next_state[0], next_state[2]))
        next_state = np.round(next_state, 2)

        if len(memory) == MEMORY_SIZE:
            memory.pop(0)
        memory.append((state, action, reward, next_state, done))
        batch = random.choice(memory)
        # print("real angle: {}".format(next_state[0]))
        # print("real vel: {}".format(next_state[1]))
        # next_state[0], next_state[1] = discretizeStateSpace(np.round(next_state[0], 2), np.round(next_state[1], 2))
        # next_state = np.round(next_state, 2)
    state = env.reset()
    state = state.reshape((1, 3))
    episode_reward = 0

    if (ep + 1) % TEST_PROGRESS == 0 and ep != 0:
        avg_reward_list.append(test_run(agent))

    for step in range(steps):
        # if ep % SHOW_PROGRESS == 0 and ep != 0:
        if False:
            env.render()
            action = agent.act(state, False)
        else:
            action = agent.act(state, True)

        next_state, reward, done, _ = env.step(action_sequence[action])
        next_state = next_state.reshape((1, 3))
        agent.memory_store(state, action, reward, next_state, done)

        if ep > 15:
            sample_index = np.random.choice(agent.memory_size, size=agent.batch_size)
            batch = agent.memory[sample_index, :]
            agent.train(batch)

        if agent.epsilon > agent.epsilon_min:
            agent.epsilon -= 0.00001

        state = next_state
        episode_reward += reward
import sys
# sys.path.append('..')
import argparse
import random
import datetime

parser = argparse.ArgumentParser()

env = PendulumEnv()
env.cnt += 1
input_shape = 3
batch_size = 64
test_agent = DankAgent([-env.max_torque, env.max_torque], input_shape, batch_size)
test_agent.load('network_vanilla.h5')

list_episode_reward = []
TESTING_EPISODES = 10

for ep in range(TESTING_EPISODES):
    state = env.reset()
    episode_reward = 0
    for t in range(500):
        env.render()
        action = test_agent.act(state, False)
        next_state, reward, done, _ = env.step(test_agent.discrete_actions[action])
        state = next_state
        episode_reward += reward
print("Ended at : ", datetime.datetime.now().strftime("%H:%M:%S")) plot_episode_stats(stats, title = "Training", noshow = True) start_time = time.time() print("Starting at : ", datetime.datetime.now().strftime("%H:%M:%S")) evaluation = 15000 evaluation_stats = EpisodeStats(episode_q_values=np.zeros(evaluation), episode_rewards=np.zeros(evaluation), episode_loss=np.zeros(evaluation)) for e in range(evaluation): print("Evaluation phase ",e, " starts....") state = env.reset() for _ in range(max_time_per_episode): #env.render() action = actor.predict(sess, np.reshape(state, (1, 3))) next_state, reward, _, _ = env.step(action) state = next_state evaluation_stats.episode_rewards[e] += reward #rewards.append(reward) #all_rewards.append(rewards) plot_episode_stats(evaluation_stats, title = "Evaluation", noshow = True) """mean_rewards_one_episode = np.mean(all_rewards, axis = 0) fig = plt.figure(figsize=(10,5)) plt.plot(mean_rewards_one_episode) plt.xlabel("Step") plt.ylabel("Episode Reward ") plt.title("Mean of all episodes") fig.savefig('reward_mean_episode.png') plt.show(fig)"""
observation = env.reset()
while not done and i < len_episode:
    first = True
    if i != 0:
        first = False
    sess.graph.clear_collection("theta_sff")
    loss = []
    i += 1

    old_observation = observation
    action = np.take(actor.predict(sess, observation), [0])
    env.render()
    observation, reward, done, info = env.step([action])
    buffer.add_transition(old_observation, action, observation, reward, done)

    s, a, ns, r, d = buffer.next_batch(batch_size)
    pred_actions = actor.predict(sess, ns)
    q_values = critic.predict(sess, ns, pred_actions)
    r = np.reshape(r, [-1, 1])
    y = q_values - r

    g_r += reward
    g_stat.append(int(np.round(g_r)))
    state = env.reset()
    state = state.reshape((1, 3))
    # state = np.array((state[0], state[2]))
    # state = np.round(state, 2)
    episode_reward = 0

    if (ep + 1) % TEST_PROGRESS == 0 and ep != 0:
        list_avg_reward[idx][run].append(test_run(agent))

    if ep < 1:
        print("filling memory")
        for i in range(MEMORY_FILL):
            # while len(memory) < MEMORY_FILL and ep < 1:
            action = agent.act(state, True)[0]
            next_state, reward, done, _ = env.step(agent.discrete_actions[action], False)
            next_state = next_state.reshape((1, 3))
            # next_state = np.round(next_state, 2)
            memory.append((state, action, reward, next_state, done))
            # agent.memory_store(state, action, reward, next_state, done)
            # if len(memory) > BATCH_SIZE[run]:
            #     batch = random.sample(memory, BATCH_SIZE[run])
            #     agent.train(batch, memory)
            state = next_state
        # if ep < 1:
        print("memory filled with {} samples".format(len(memory)))
class QLearning():
    def __init__(self):
        self.env = PendulumEnv()
        self.env.reset()
        self.action_number = 101
        self.state_space = 2
        self.epsilon = 0.2
        self.bias = 0
        self.gamma = 0.99

        min_torque = -self.env.max_torque
        max_torque = self.env.max_torque
        self.actions = []
        for i in range(self.action_number):
            self.actions.append(min_torque + (max_torque - min_torque) / (self.action_number - 1) * i)
        print(self.actions)

        self.weight_matrix = np.zeros((len(self.actions), self.state_space))

    def train(self, episodes=10, max_iterations=20000, learning_rate=0.01):
        for episode_number in range(episodes):
            total_reward = 0
            curr_state_info = self.env.reset()
            curr_state = np.zeros(self.state_space)
            for i in range(self.state_space):
                curr_state[i] = curr_state_info[i]
            # print(curr_state)

            for iteration_number in range(max_iterations):
                time.sleep(0.1)
                print(curr_state)

                # epsilon-greedy action selection
                random_float = random.random()
                if random_float < self.epsilon:
                    action = np.random.randint(0, self.action_number - 1)
                else:
                    action = -1
                    max_q_s_a_w = -sys.float_info.max
                    for action_iter in range(self.action_number):
                        q_s_a_w_iter = np.dot(curr_state, self.weight_matrix[action_iter]) + self.bias
                        if q_s_a_w_iter > max_q_s_a_w:
                            max_q_s_a_w = q_s_a_w_iter
                            action = action_iter

                u = np.array([self.actions[action]]).astype(self.env.action_space.dtype)
                # u = np.array([500]).astype(self.env.action_space.dtype)
                next_state_info, reward, isDone, _ = self.env.step(u)
                print("reward : ", reward)
                print(self.actions[action])
                print("")

                next_state = np.zeros((self.state_space))
                for i in range(self.state_space):
                    next_state[i] = float(next_state_info[i])

                # max over actions of the approximate q-value at the next state
                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(self.action_number):
                    q_s_a_w_iter = np.dot(next_state, self.weight_matrix[action_iter]) + self.bias
                    if q_s_a_w_iter > max_q_s_a_w:
                        max_q_s_a_w = q_s_a_w_iter

                gradient_matrix_w = np.zeros((self.action_number, self.state_space))
                gradient_matrix_w[action] = curr_state

                copy_of_weight_matrix = copy.deepcopy(self.weight_matrix)
                copy_of_bias = copy.deepcopy(self.bias)

                self.weight_matrix = self.weight_matrix - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) + copy_of_bias)
                    - (reward + self.gamma * max_q_s_a_w)) * gradient_matrix_w
                self.bias = self.bias - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) + copy_of_bias)
                    - (reward + self.gamma * max_q_s_a_w)) * 1.0

                curr_state = next_state
                total_reward += reward

                if True or episode_number % 100 == 0:
                    self.env.render()

        self.env.close()
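# Usage sketch (hypothetical; assumes numpy, random, sys, copy, time, and PendulumEnv
# are imported in this module):
if __name__ == '__main__':
    agent = QLearning()
    agent.train(episodes=10, max_iterations=20000, learning_rate=0.01)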
class ising:
    # Initialize the network
    def __init__(self, netsize, Nsensors=1, Nmotors=1):
        # Create ising model
        self.size = netsize     # Network size
        self.Ssize = Nsensors   # Number of sensors
        self.Msize = Nmotors    # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = PendulumEnv()
        self.observation = self.env.reset()

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        self.maxspeed = self.env.max_speed
        self.Update(-1)

    def get_state(self, mode='all'):
        if mode == 'all':
            return self.s
        elif mode == 'motors':
            return self.s[-self.Msize:]
        elif mode == 'sensors':
            return self.s[0:self.Ssize]
        elif mode == 'input':
            return self.sensors
        elif mode == 'non-sensors':
            return self.s[self.Ssize:]
        elif mode == 'hidden':
            return self.s[self.Ssize:-self.Msize]

    def get_state_index(self, mode='all'):
        return bool2int(0.5 * (self.get_state(mode) + 1))

    # Randomize the state of the network
    def randomize_state(self):
        self.s = np.random.randint(0, 2, self.size) * 2 - 1
        self.sensors = np.random.randint(0, 2, self.Ssize) * 2 - 1

    # Randomize the position of the agent
    def randomize_position(self):
        self.observation = self.env.reset()

    # Set random bias to sets of units of the system
    def random_fields(self, max_weights=None):
        if max_weights is None:
            max_weights = self.max_weights
        self.h[self.Ssize:] = max_weights * \
            (np.random.rand(self.size - self.Ssize) * 2 - 1)

    # Set random connections to sets of units of the system
    def random_wiring(self, max_weights=None):
        # Set random values for h and J
        if max_weights is None:
            max_weights = self.max_weights
        for i in range(self.size):
            for j in np.arange(i + 1, self.size):
                if i < j and (i >= self.Ssize or j >= self.Ssize):
                    self.J[i, j] = (np.random.rand(1) * 2 - 1) * self.max_weights

    # Update the position of the agent
    def Move(self):
        self.previous_speed = self.observation[1]
        action = np.sum(self.s[-self.Msize:]) / self.Msize
        observation, reward, done, info = self.env.step(action)
        self.observation = self.env.state
        self.positionX = np.cos(self.env.state[0]) * self.env.l
        self.positionY = np.sin(self.env.state[0]) * self.env.l
        self.speed = self.env.state[1]

    # Transform the sensor input into an integer index
    def SensorIndex(self, x, xmax):
        return int(np.floor((x + xmax) / (2 * xmax + 10 * np.finfo(float).eps) * 2**self.Ssize))

    # Update the state of the sensor
    def UpdateSensors(self):
        self.speed_ind = self.SensorIndex(self.speed, self.maxspeed)
        self.sensors = 2 * bitfield(self.speed_ind, self.Ssize) - 1

    # Execute a step of the Glauber algorithm to update the state of one unit
    def GlauberStep(self, i=None):
        if i is None:
            i = np.random.randint(self.size)
        I = 0
        if i < self.Ssize:
            I = self.sensors[i]
        eDiff = 2 * self.s[i] * (self.h[i] + I + np.dot(self.J[i, :] + self.J[:, i], self.s))
        if eDiff * self.Beta < np.log(1 / np.random.rand() - 1):  # Glauber
            self.s[i] = -self.s[i]

    # Update a random unit of the agent
    def Update(self, i=None):
        if i is None:
            i = np.random.randint(-1, self.size)
        if i == -1:
            self.Move()
            self.UpdateSensors()
        else:
            self.GlauberStep(i)

    # Sequentially update the state of all units of the agent in random order
    def SequentialUpdate(self):
        for i in np.random.permutation(range(-1, self.size)):
            self.Update(i)

    # Step of the learning algorithm to adjust correlations toward the critical regime
    def AdjustCorrelations(self, T=None):
        if T is None:
            T = self.defaultT

        self.m = np.zeros(self.size)
        self.c = np.zeros((self.size, self.size))
        self.C = np.zeros((self.size, self.size))

        # Main simulation loop:
        samples = []
        for t in range(T):
            self.SequentialUpdate()
            # self.x[t] = self.position
            self.m += self.s
            for i in range(self.size):
                self.c[i, i + 1:] += self.s[i] * self.s[i + 1:]
        self.m /= T
        self.c /= T
        for i in range(self.size):
            self.C[i, i + 1:] = self.c[i, i + 1:] - self.m[i] * self.m[i + 1:]

        c1 = np.zeros((self.size, self.size))
        for i in range(self.size):
            inds = np.array([], int)
            c = np.array([])
            for j in range(self.size):
                if not i == j:
                    inds = np.append(inds, [j])
                    if i < j:
                        c = np.append(c, [self.c[i, j]])
                    elif i > j:
                        c = np.append(c, [self.c[j, i]])
            order = np.argsort(c)[::-1]
            c1[i, inds[order]] = self.Cint[i, :]
        self.c1 = np.triu(c1 + c1.T, 1)
        self.c1 *= 0.5

        self.m[0:self.Ssize] = 0
        self.m1[0:self.Ssize] = 0
        self.c[0:self.Ssize, 0:self.Ssize] = 0
        self.c[-self.Msize:, -self.Msize:] = 0
        self.c[0:self.Ssize, -self.Msize:] = 0
        self.c1[0:self.Ssize, 0:self.Ssize] = 0
        self.c1[-self.Msize:, -self.Msize:] = 0
        self.c1[0:self.Ssize, -self.Msize:] = 0

        dh = self.m1 - self.m
        dJ = self.c1 - self.c
        # print(self.m1, self.m)
        # print("dh, dJ:", dh, ",", dJ)
        return dh, dJ

    # Algorithm for poising an agent in a critical regime
    def CriticalLearning(self, Iterations, T=None):
        u = 0.01
        count = 0
        dh, dJ = self.AdjustCorrelations(T)
        fit = max(np.max(np.abs(self.c1 - self.c)), np.max(np.abs(self.m1 - self.m)))
        for it in range(Iterations):
            count += 1
            self.h += u * dh
            self.J += u * dJ
            if it % 10 == 0:
                self.randomize_state()
                self.randomize_position()
            Vmax = self.max_weights
            for i in range(self.size):
                if np.abs(self.h[i]) > Vmax:
                    self.h[i] = Vmax * np.sign(self.h[i])
                for j in np.arange(i + 1, self.size):
                    if np.abs(self.J[i, j]) > Vmax:
                        self.J[i, j] = Vmax * np.sign(self.J[i, j])
            dh, dJ = self.AdjustCorrelations(T)
            fit = np.max(np.abs(self.c1 - self.c))
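# Usage sketch (hypothetical; the bool2int()/bitfield() helpers and the m1/Cint
# reference statistics must be defined elsewhere before CriticalLearning() will run):
if __name__ == '__main__':
    agent = ising(netsize=10, Nsensors=4, Nmotors=2)
    agent.CriticalLearning(Iterations=1000, T=agent.defaultT)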
class Simulator():
    def __init__(self, data_dictionary=None, policy_name=None, policy_directory=None):
        self.save_directory = 'rory_data'
        self.trainer = QLearning()
        self.data = data_dictionary

        self.num_actions = self.trainer.num_avail_actions
        self.num_positions = self.trainer.num_avail_positions
        self.num_velocities = self.trainer.num_avail_velocities

        if data_dictionary is None:
            if policy_directory is None:
                self.load_directory = self.trainer.save_directory
            else:
                self.load_directory = policy_directory

            if policy_name is None:
                self.policy_name = self.grab_newest_policy()
            else:
                self.policy_name = policy_name

            # goal_theta_num, goal_theta_den = self.get_goal_theta(self.policy_name)
            # self.goal_theta = goal_theta_num / goal_theta_den
            self.goal_theta = np.pi / 4
            self.file = os.path.join(self.load_directory, self.policy_name)
            self.policy = self.load_policy()
            self.data = dict()
        else:
            self.goal_theta = self.data['goal_theta']
            self.policy = self.data['policy']
            self.file = self.data['fname']
            self.policy_name = self.file

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_actions)

        self.dummy_q = self.trainer.q_matrix
        self.num_useful_actions = 0
        self.torques = []
        self.theta_errors = []

    def load_policy(self):
        policy = np.load(self.file, allow_pickle=True)
        policy = policy.item().get('policy')
        return policy

    def grab_newest_policy(self):
        all_policies = os.listdir(self.load_directory)
        file_count = 0
        theta_dict = {}
        name_dict = {}
        for i, policy in enumerate(all_policies):
            if not policy.startswith('.'):  # ignore .DS_Store files, it's a Mac thing
                fname, _ = os.path.splitext(policy)
                try:
                    gtheta = '_'.join(fname.split('_')[-2:])
                    fname = '_'.join(fname.split('_')[:-2])
                    time_components = np.array(list(map(int, fname.split('_'))))
                    file_count += 1
                except ValueError:
                    continue
                theta_dict[i] = gtheta
                if file_count == 1:
                    name_array = time_components
                else:
                    name_array = np.row_stack((name_array, time_components))
                name_dict[fname] = i

        while len(name_array.shape) > 1:
            col_idx_diff = np.any(name_array != name_array[0, :], axis=0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]

        newest_policy = name_array
        newest_policy = '_'.join(map(str, newest_policy))
        suffix_theta = theta_dict[name_dict[newest_policy]]
        newest_policy += '_' + suffix_theta + '.npy'
        return newest_policy

    def get_goal_theta(self, pol_name):
        # same logic as grab_newest_policy(), except it just grabs the goal theta
        fname, _ = os.path.splitext(pol_name)
        len_fname = len(fname) - 1
        if 'pi' in fname:
            idx_pi = fname.find('pi')
            if idx_pi != len_fname:
                # index of numerator
                num = np.pi
            else:
                # index of denominator
                den = np.pi
            fname = fname.replace('pi', '555')
        if 'ip' in fname:
            idx_ip = fname.find('ip')
            if idx_ip != len_fname:
                # index of numerator
                num = -np.pi
            else:
                # index of denominator
                den = -np.pi
            fname = fname.replace('ip', '555')

        time_components = np.array(list(map(int, fname.split('_'))))
        name_array = time_components
        while len(name_array.shape) > 1:
            col_idx_diff = np.any(name_array != name_array[0, :], axis=0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]

        newest_policy = name_array
        temp_num = newest_policy[-2]
        temp_den = newest_policy[-1]
        if temp_num != 555:
            num = temp_num
        if temp_den != 555:
            den = temp_den
        return num, den

    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()
        return torIdx, thIdx, thdotIdx

    def getMaxQValue(self, th, thdot):
        # returns the action (depth) index and q-value of the highest-valued torque for the given (th, thdot) state
        maxQValIdx = self.policy[:, th, thdot].argmax()
        maxQVal = self.policy[maxQValIdx, th, thdot]
        return maxQValIdx, maxQVal

    def get_action(self, th, thdot):
        _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
        chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)
        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)
        return u

    def save_precious_simulated_data(self):
        self.data["simulated_episodes"] = self.num_episodes
        self.data["simulated_iterations"] = self.num_iterations
        self.data["avg_sim_cost"] = self.avg_cost
        self.data["perc_useful_actions"] = self.num_useful_actions
        self.data["torque_arr"] = self.torques
        self.data["theta_error_arr"] = self.theta_errors

        fname = self.policy_name
        save_path = os.path.join(self.save_directory, fname)
        if not os.path.exists(self.save_directory):
            os.mkdir(self.save_directory)
        np.save(save_path, self.data)
        print(f'saved precious data: {fname}')

    def update_dummy_q(self, torI, thI, thDotI):
        if self.dummy_q[torI, thI, thDotI] != 1000:
            self.dummy_q[torI, thI, thDotI] = 1000

    def simulate(self, ep_num=500, iter_num=150, start_pos=None, start_vel=None):
        print(f'Running simulation using policy: {self.file}')
        self.num_episodes = ep_num
        self.num_iterations = iter_num
        total_total_cost = 0
        try:
            for i in range(self.num_episodes):
                th, thdot = self.env.reset()
                if start_pos is not None:
                    self.env.state[0] = start_pos
                    th = start_pos
                if start_vel is not None:
                    self.env.state[1] = start_vel
                    thdot = start_vel

                for _ in range(self.num_iterations):
                    self.env.render()
                    self.theta_errors.append(self.goal_theta - th)
                    u = self.get_action(th, thdot)
                    self.torques.append(u)
                    torIdx, thIdx, thDotIdx = self.getQMatrixIdx(th, thdot, u)
                    self.update_dummy_q(torIdx, thIdx, thDotIdx)
                    nextTh, nextThdot, reward = self.env.step(u)
                    total_total_cost += reward
                    th = nextTh
                    thdot = nextThdot
                    time.sleep(.05)

                if i != self.num_episodes - 1:
                    self.torques = []
                    self.theta_errors = []
        except KeyboardInterrupt:
            pass

        self.avg_cost = total_total_cost / self.num_episodes
        self.num_useful_actions = (self.dummy_q == 1000).sum() / self.dummy_q.size
        self.env.close()
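# Usage sketch (hypothetical; loads the newest policy from the trainer's
# 'saved_policies' directory, rolls it out, then dumps stats to 'rory_data'):
if __name__ == '__main__':
    sim = Simulator()
    sim.simulate(ep_num=10, iter_num=150)
    sim.save_precious_simulated_data()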