Example #1
import time

import numpy as np

# (assumes the project's custom PendulumEnv class is imported as in the other examples)


def render_test(torque_type=0):
    '''
    torque_types:
        - 0 : square wave torque to make the pendulum oscillate back and forth
        - 1 : some constant value torque
        - 2 (or anything else) : random torque
    '''

    env = PendulumEnv()
    env.reset()

    at_rest = True
    val = 0  # default torque so `val` is defined before the square-wave logic first assigns it

    try:
        for _ in range(500):
            env.render()

            if torque_type == 0:

                if env.state[0] == env.angle_limit and at_rest:
                    val = env.max_torque
                    at_rest = False

                elif env.state[0] == -env.angle_limit and at_rest:
                    val = -env.max_torque
                    at_rest = False

                if abs(env.state[0]) == env.angle_limit and not at_rest:
                    at_rest = True

                u = np.array([val]).astype(env.action_space.dtype)

            elif torque_type == 1:
                val = -43
                u = np.array([val]).astype(env.action_space.dtype)

            else:
                u = env.action_space.sample()

            info = env.step(u)
            # print(info)
            # print(env.state[0]) # print angular position
            print(env.state[1])  # print angular velocity

            time.sleep(.1)

    except KeyboardInterrupt:
        pass

    env.close()
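

# Hypothetical driver for render_test() above (not part of the original
# source); it exercises the random-torque mode:
if __name__ == '__main__':
    render_test(torque_type=2)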
Example #2
    return K
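
# The body of lqr() is truncated above (only its closing `return K` survives).
# A minimal sketch of one common implementation, assuming continuous-time
# linearized dynamics x_dot = A x + B u and that SciPy is available:
from scipy.linalg import solve_continuous_are

def lqr(A, B, Q, R):
    # solve the continuous-time algebraic Riccati equation for P,
    # then form the optimal gain K = R^-1 B^T P
    P = solve_continuous_are(A, B, Q, R)
    K = np.linalg.inv(R) @ B.T @ P
    return K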

# create environment
env = PendulumEnv()

# reset environment
state = env.reset()
done = False

# Get linearized dynamics
A,B = env._linearize(np.pi)

# create cost matrices for LQR
Q = np.array([[1,0], [0,10]])
R = np.array([[0.001]])

# Compute gain matrix K
K = lqr(A,B,Q,R)

# Run environment
i = 0
while not done:
    env.render()
    if i >= 1:
        action = -np.matmul(K, state.reshape(2) - np.array([np.pi, 0]))
    else:
        action = -np.matmul(K, state - np.array([np.pi, 0]))
    state, _, done, _ = env.step(action)
    i += 1
env.close()
class QLearning():

    def __init__(self, goal_theta_num=0, goal_theta_den=1):
        self.goal_theta_num = goal_theta_num
        self.goal_theta_den = goal_theta_den
        self.goal_theta = goal_theta_num / goal_theta_den

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.save_directory = 'saved_policies'

        self.epsilon = .2
        self.gamma = .99

        self.num_avail_actions = 31
        self.num_avail_positions = 51
        self.num_avail_velocities = 51

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_avail_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_avail_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_avail_actions)

        self.q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        ))

        self.converge_threshold = 0.05 # fraction of a q-value that its update (dq) must fall below to count as converged
        self.perc_conv_thresh = 0.8 # fraction of q-values that must pass the convergence threshold

        self.percent_converged = 0
        self.percent_unexplored = 0

        self.prev_q_matrix = np.zeros((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        )) # previous q-matrix

        self.dq_matrix = 100 * np.ones((
            self.num_avail_actions,
            self.num_avail_positions,
            self.num_avail_velocities
        )) # delta-q matrix, tracks amount each weight is being updated

        self.data = dict()
        self.start_time = 0
        self.total_time = 0

        self.ep_rewards = []
        self.perc_unexplored_arr = []
        self.perc_conv_arr = []
    

    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()

        return torIdx, thIdx, thdotIdx


    def getMaxQValue(self, th, thdot):
        # returns the action (depth) index with the highest q-value for the
        # given th,thdot state, along with that q-value
        maxQValIdx = self.q_matrix[:, th, thdot].argmax()
        maxQVal = self.q_matrix[maxQValIdx, th, thdot]

        return maxQValIdx, maxQVal


    def get_action(self, th, thdot):

        random_float = random.random()
        if (random_float < self.epsilon): # if a random float is less than our epsilon, explore a random action
            chosen_idx = np.random.randint(0, self.num_avail_actions)

        else: # if the random float is not less than epsilon, exploit the policy
            _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
            chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)

        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)

        return u
    

    def check_converged(self):
        # total number of elements
        total_elements = self.q_matrix.size
        
        percent_changed = np.divide(self.dq_matrix, self.q_matrix,
                        out=np.ones(self.q_matrix.shape),
                        where=self.q_matrix != 0)

        # compute the number of 'converged' q-values
        num_converged = (percent_changed < self.converge_threshold).sum()
        # percentage of converged q-values
        self.percent_converged = num_converged / total_elements

        self.percent_unexplored = (self.dq_matrix == 100).sum() / self.dq_matrix.size

        if self.percent_converged >= self.perc_conv_thresh:
            return True

        else:
            return False


    def get_convergence_stats(self):
        # returns a dictionary whose keys are convergence thresholds (0 to 0.95 in steps of 0.05)
        # and whose values are the fraction of the q-matrix that meets each threshold
        conv_stats = dict()
        total_elements = self.q_matrix.size
        conv_threshold = 0.05

        for i in range(20):
            new_threshold = (conv_threshold * i)
            percent_changed = np.divide(self.dq_matrix, self.q_matrix,
                            out=np.ones(self.q_matrix.shape),
                            where=self.q_matrix != 0)
            # compute the number of 'converged' q-values
            num_converged = (percent_changed < new_threshold).sum()
            # percentage of converged q-values
            percent_converged = num_converged / total_elements

            conv_stats[new_threshold] = percent_converged

        return conv_stats


    def train(self, episodes=15000, max_iterations=100000, l_rate=0.1):
        self.start_time = time.time()

        for episode_num in range(episodes):

            self.episodes = episode_num
            self.iterations = max_iterations
            self.l_rate = l_rate
            
            # reset the environment and declare th,thdot
            th, thdot = self.env.reset()

            iter_count = -1
            total_reward = 0

            while(not self.env.is_done and iter_count < max_iterations):
                iter_count += 1

                # select a new action to take
                u = self.get_action(th, thdot)

                # find the current indices in self.q_matrix so we can update the q-value for this (th, thdot, u)
                currTorIdx, currThIdx, currThdotIdx = self.getQMatrixIdx(th, thdot, u)

                # find next state corresponding to chosen action
                nextTh, nextThdot, reward = self.env.step(u)
                total_reward += reward

                _, nextThIdx, nextThdotIdx = self.getQMatrixIdx(nextTh, nextThdot, u)

                # find the highest q-value in self.q_matrix for the next state (nextTh, nextThdot)
                _, nextQVal = self.getMaxQValue(nextThIdx, nextThdotIdx)

                self.q_matrix[currTorIdx, currThIdx, currThdotIdx] += l_rate * (reward + self.gamma * nextQVal \
                                                                    - self.q_matrix[currTorIdx, currThIdx, currThdotIdx])

                self.dq_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx] \
                                                                    - self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx]

                self.prev_q_matrix[currTorIdx, currThIdx, currThdotIdx] = self.q_matrix[currTorIdx, currThIdx, currThdotIdx]

                th = nextTh
                thdot = nextThdot
                    
                if iter_count % 100 == 0:
                    # self.env.render()
                    print('iter_count = ', iter_count)
                    print('episode = ', episode_num)
                    print('epsilon = ', self.epsilon)
                    print(f'percent converged = {round(self.percent_converged, 2)}')
                    print(f'percent unexplored = {round(self.percent_unexplored, 2)}')
                    print("Time Elapsed: ",time.strftime("%H:%M:%S",time.gmtime(time.time()-self.start_time)))
                    print('')
                    
            converged = self.check_converged()
            
            if converged:
                print(f'Converged on episode {episode_num}')
                break

            if episode_num % 10 == 0:
                self.ep_rewards.append(total_reward)
                self.perc_unexplored_arr.append(self.percent_unexplored)
                self.perc_conv_arr.append(self.percent_converged)

        
        self.print_stuff()
        self.save_policy()
        self.get_precious_data()

    
    def increase_epsilon_maybe(self, ep_num):
        # note: despite its name, this decays epsilon (reducing exploration) every 1000 episodes
        if ep_num % 1000 == 0 and ep_num != 0:
            self.epsilon -= .006

    
    def save_policy(self):
        time_struct = time.localtime(time.time())
        fname = self.get_fname(time_struct)
        self.data['fname'] = fname
        save_path = os.path.join(self.save_directory, fname)

        np.save(save_path, self.q_matrix)

        print(f'saved policy: {fname}')


    def get_fname(self, time_params):
        year = time_params.tm_year
        month = time_params.tm_mon
        day = time_params.tm_mday
        hour = time_params.tm_hour
        minute = time_params.tm_min
        sec = time_params.tm_sec

        if isclose(self.goal_theta_num, np.pi, abs_tol=1e-7):
            num = 'pi'
        elif isclose(self.goal_theta_num, -np.pi, abs_tol=1e-7):
            num = 'ip'
        else:
            num = self.goal_theta_num

        if isclose(self.goal_theta_den, np.pi, abs_tol=1e-7):
            den = 'pi'
        elif isclose(self.goal_theta_den, -np.pi, abs_tol=1e-7):
            den = 'ip'
        else:
            den = self.goal_theta_den

        fname = f'{year}_{month}_{day}_{hour}_{minute}_{sec}_{num}_{den}'

        return fname
    

    def print_stuff(self):

        print("Training Done")
        print("Total Time Elapsed: ",time.strftime("%H:%M:%S",time.gmtime(time.time()-self.start_time)))
        print(f'percent converged = {round(self.percent_converged, 2)}')
        print(f'percent unexplored = {round(self.percent_unexplored, 2)}')


    def get_precious_data(self):
        self.data["gamma"] = self.gamma
        self.data["epsilon"] = self.epsilon
        self.data["goal_theta"] = self.goal_theta
        self.data["perc_actions_explored"] = 1 - self.percent_unexplored
        self.data["perc_converged"] = self.percent_converged
        self.data["conv_perc_req"] = self.perc_conv_thresh # % of q-values that must pass the convergence threshold
        self.data["converge_threshold"] = self.converge_threshold # % of q-value that dq must be close to to be considered converged
        self.data["training_time"] = self.total_time
        self.data["training_episodes"] = self.episodes
        self.data["training_iterations"] = self.iterations
        self.data["learning_rate"] = self.l_rate
        self.data["policy"] = self.q_matrix
        self.data["converged_stats"] = self.get_convergence_stats()
        self.data["ep_rewards_arr"] = self.ep_rewards
        self.data["perc_unexplored_arr"] = self.perc_unexplored_arr
        self.data["perc_converged_arr"] = self.perc_conv_arr
Example #4
        terminal[i] = self.terminal.take(index, axis=0, mode='wrap')
    return state, action, next_state, reward, terminal


for ep in range(TRAINING_EPISODES):
    if ep % AUTO_SAVER == 0 and nepisodes != 0:
        print_timestamp("saved")
        agent.save(weights_file)
    state = env.reset()
    state = np.array((state[0], state[2]))
    state = np.round(state, 2)
    for t in range(TIMESTEPS):
        env.render()

        action = random.choice(agent.discrete_actions)
        next_state, reward, done, _ = env.step(action, state[1])

        next_state = np.array((next_state[0], next_state[2]))
        next_state = np.round(next_state, 2)

        if len(memory) == MEMORY_SIZE:
            memory.pop(0)
        memory.append((state, action, reward, next_state, done))

        batch = random.choice(memory)

        # print("real angle: {}".format(next_state[0]))
        # print("real vel: {}".format(next_state[1]))
        # next_state[0], next_state[1] = discretizeStateSpace(np.round(next_state[0],2),np.round(next_state[1],2))

        # next_state = np.round(next_state,2)
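

# Hypothetical alternative (not in the original source): random.choice above
# draws a single transition as the "batch"; a more common scheme is to sample
# a uniform minibatch with a helper like this (the batch_size value is assumed):
def sample_minibatch(memory, batch_size=32):
    minibatch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = map(np.array, zip(*minibatch))
    return states, actions, rewards, next_states, dones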
Example #5
    state = env.reset()
    state = state.reshape((1, 3))
    episode_reward = 0

    if (ep + 1) % TEST_PROGRESS == 0 and ep != 0:
        avg_reward_list.append(test_run(agent))

    for step in range(steps):
        #if ep % SHOW_PROGRESS == 0 and ep != 0:
        if False:
            env.render()
            action = agent.act(state, False)
        else:
            action = agent.act(state, True)

        next_state, reward, done, _ = env.step(action_sequence[action])
        next_state = next_state.reshape((1, 3))

        agent.memory_store(state, action, reward, next_state, done)

        if (ep > 15):
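            # note: np.random.choice below samples indices over the full buffer
            # capacity (agent.memory_size), so early in training it may return
            # slots that have not been written yet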
            sample_index = np.random.choice(agent.memory_size,
                                            size=agent.batch_size)
            batch = agent.memory[sample_index, :]
            agent.train(batch)
        if agent.epsilon > agent.epsilon_min:
            agent.epsilon -= 0.00001

        state = next_state
        episode_reward += reward
Example #6
import sys
#sys.path.append('..')
import argparse
import random
import datetime
import time

import numpy as np

parser = argparse.ArgumentParser()



env = PendulumEnv()
env.cnt += 1
input_shape = 3
batch_size = 64
test_agent = DankAgent([-env.max_torque,env.max_torque],input_shape,batch_size)
test_agent.load('network_vanilla.h5')

list_episode_reward = []

TESTING_EPISODES = 10

for ep in range(TESTING_EPISODES):
    state = env.reset()
    episode_reward = 0
    for t in range(500):
        env.render()

        action = test_agent.act(state, False)
        next_state, reward, done , _ = env.step(test_agent.discrete_actions[action])

        state = next_state
        episode_reward += reward
		print("Ended at : ", datetime.datetime.now().strftime("%H:%M:%S"))

	plot_episode_stats(stats, title = "Training", noshow = True)

	start_time = time.time()
	print("Starting at : ", datetime.datetime.now().strftime("%H:%M:%S"))

	evaluation = 15000
	evaluation_stats = EpisodeStats(episode_q_values=np.zeros(evaluation), episode_rewards=np.zeros(evaluation), episode_loss=np.zeros(evaluation))
	for e in range(evaluation):
		print("Evaluation phase ",e, " starts....")
		state = env.reset()
		for _ in range(max_time_per_episode):
		  #env.render()
		  action = actor.predict(sess, np.reshape(state, (1, 3)))
		  next_state, reward, _, _ = env.step(action)
		  state = next_state
		  evaluation_stats.episode_rewards[e] += reward
		  #rewards.append(reward)
		#all_rewards.append(rewards)

	plot_episode_stats(evaluation_stats, title = "Evaluation", noshow = True)

	"""mean_rewards_one_episode = np.mean(all_rewards, axis = 0)
		fig = plt.figure(figsize=(10,5))
		plt.plot(mean_rewards_one_episode)
		plt.xlabel("Step")
		plt.ylabel("Episode Reward ")
		plt.title("Mean of all episodes")
		fig.savefig('reward_mean_episode.png')
		plt.show(fig)"""
Example #8
                observation = env.reset()

                while not done and i < len_episode:

                    first = True
                    if i != 0:
                        first = False
                        sess.graph.clear_collection("theta_sff")
                    loss = []
                    i += 1
                    old_observation = observation
                    action = np.take(actor.predict(sess, observation), [0])

                    env.render()
                    observation, reward, done, info = env.step([action])

                    buffer.add_transition(old_observation, action, observation,
                                          reward, done)
                    s, a, ns, r, d = buffer.next_batch(batch_size)

                    pred_actions = actor.predict(sess, ns)

                    q_values = critic.predict(sess, ns, pred_actions)

                    r = np.reshape(r, [-1, 1])
                    y = q_values - r
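                    # note: as written, the target y is Q(s', mu(s')) - r; a
                    # conventional actor-critic/DDPG target would instead be
                    # r + gamma * Q_target(s', mu_target(s'))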

                    g_r += reward
                    g_stat.append(int(np.round(g_r)))
Example #9
            state = env.reset()
            state = state.reshape((1,3))
            #state = np.array((state[0],state[2]))
            #state = np.round(state,2)
            episode_reward = 0
            if (ep+1) % TEST_PROGRESS == 0 and ep != 0:
                list_avg_reward[idx][run].append(test_run(agent))


            if ep < 1:
                print("filling memory")
                for i in range(MEMORY_FILL):
            # while len(memory) < MEMORY_FILL and ep < 1:
                    action = agent.act(state, True)[0]

                    next_state, reward, done , _ = env.step(agent.discrete_actions[action], False)
                    next_state = next_state.reshape((1,3))

                    #next_state = np.round(next_state,2)
                    memory.append((state, action, reward, next_state, done))
                    #agent.memory_store(state, action, reward, next_state, done)

                    #if len(memory) > BATCH_SIZE[run]:
                        #batch = random.sample(memory, BATCH_SIZE[run])
                        #agent.train(batch,memory)

                    state = next_state
            # if ep < 1:
                print("memory filled with {} samples".format(len(memory)))

Example #10
class QLearning():
    def __init__(self):
        self.env = PendulumEnv()
        self.env.reset()
        self.action_number = 101
        self.state_space = 2
        self.epsilon = 0.2
        self.bias = 0
        self.gamma = 0.99
        min_torque = -self.env.max_torque
        max_torque = self.env.max_torque
        self.actions = []
        for i in range(self.action_number):
            self.actions.append(min_torque + (max_torque - min_torque) /
                                (self.action_number - 1) * i)
        print(self.actions)
        self.weight_matrix = np.zeros((len(self.actions), self.state_space))

    def train(self, episodes=10, max_iterations=20000, learning_rate=0.01):
        for episodes_number in range(episodes):
            total_reward = 0
            curr_state_info = self.env.reset()
            curr_state = np.zeros(self.state_space)
            for i in range(self.state_space):
                curr_state[i] = curr_state_info[i]

            #print (curr_state)
            for iteration_number in range(max_iterations):
                time.sleep(0.1)
                print(curr_state)
                random_float = random.random()
                if (random_float < self.epsilon):
                    # randint's upper bound is exclusive, so use action_number to cover every action
                    action = np.random.randint(0, self.action_number)
                else:
                    action = -1
                    max_q_s_a_w = -sys.float_info.max
                    for action_iter in range(self.action_number):
                        q_s_a_w_iter = np.dot(
                            curr_state,
                            self.weight_matrix[action_iter]) + self.bias
                        if (q_s_a_w_iter > max_q_s_a_w):
                            max_q_s_a_w = q_s_a_w_iter
                            action = action_iter

                u = np.array([self.actions[action]]).astype(self.env.action_space.dtype)
                #u = np.array([500]).astype(self.env.action_space.dtype)

                next_state_info, reward, isDone, _ = self.env.step(u)
                print("reward : ", reward)
                print(self.actions[action])
                print("")

                next_state = np.zeros((self.state_space))
                for i in range(self.state_space):
                    next_state[i] = float(next_state_info[i])

                max_q_s_a_w = -sys.float_info.max
                for action_iter in range(self.action_number):
                    q_s_a_w_iter = np.dot(
                        next_state,
                        self.weight_matrix[action_iter]) + self.bias
                    if (q_s_a_w_iter > max_q_s_a_w):
                        max_q_s_a_w = q_s_a_w_iter

                gradient_matrix_w = np.zeros(
                    (self.action_number, self.state_space))
                gradient_matrix_w[action] = curr_state

                copy_of_weight_matrix = copy.deepcopy(self.weight_matrix)
                copy_of_bias = copy.deepcopy(self.bias)

                self.weight_matrix = self.weight_matrix - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) +
                     copy_of_bias) -
                    (reward + self.gamma * max_q_s_a_w)) * gradient_matrix_w
                self.bias = self.bias - learning_rate * (
                    (np.dot(curr_state, copy_of_weight_matrix[action]) +
                     copy_of_bias) - (reward + self.gamma * max_q_s_a_w)) * 1.0

                curr_state = next_state
                total_reward += reward

                if (True or episodes_number % 100 == 0):
                    self.env.render()

        self.env.close()
Example #11
class ising:
    # Initialize the network
    def __init__(self, netsize, Nsensors=1, Nmotors=1):  # Create ising model

        self.size = netsize  # Network size
        self.Ssize = Nsensors  # Number of sensors
        self.Msize = Nmotors  # Number of motors

        self.h = np.zeros(netsize)
        self.J = np.zeros((netsize, netsize))
        self.max_weights = 2

        self.randomize_state()

        self.env = PendulumEnv()

        self.observation = self.env.reset()

        self.Beta = 1.0
        self.defaultT = max(100, netsize * 20)

        self.Ssize1 = 0
        self.maxspeed = self.env.max_speed
        self.Update(-1)

    def get_state(self, mode='all'):
        if mode == 'all':
            return self.s
        elif mode == 'motors':
            return self.s[-self.Msize:]
        elif mode == 'sensors':
            return self.s[0:self.Ssize]
        elif mode == 'input':
            return self.sensors
        elif mode == 'non-sensors':
            return self.s[self.Ssize:]
        elif mode == 'hidden':
            return self.s[self.Ssize:-self.Msize]

    def get_state_index(self, mode='all'):
        return bool2int(0.5 * (self.get_state(mode) + 1))

    # Randomize the state of the network
    def randomize_state(self):
        self.s = np.random.randint(0, 2, self.size) * 2 - 1
        self.sensors = np.random.randint(0, 2, self.Ssize) * 2 - 1

    # Randomize the position of the agent
    def randomize_position(self):
        self.observation = self.env.reset()

    # Set random bias to sets of units of the system
    def random_fields(self, max_weights=None):
        if max_weights is None:
            max_weights = self.max_weights
        self.h[self.Ssize:] = max_weights * \
         (np.random.rand(self.size - self.Ssize) * 2 - 1)

    # Set random connections to sets of units of the system
    def random_wiring(self, max_weights=None):  # Set random values for h and J
        if max_weights is None:
            max_weights = self.max_weights
        for i in range(self.size):
            for j in np.arange(i + 1, self.size):
                if i < j and (i >= self.Ssize or j >= self.Ssize):
                    self.J[i, j] = (np.random.rand(1) * 2 - 1) * self.max_weights

    # Update the position of the agent
    def Move(self):
        self.previous_speed = self.observation[1]
        action = np.sum(self.s[-self.Msize:]) / self.Msize
        observation, reward, done, info = self.env.step(action)

        self.observation = self.env.state
        self.positionX = np.cos(self.env.state[0]) * self.env.l
        self.positionY = np.sin(self.env.state[0]) * self.env.l

        self.speed = self.env.state[1]

    # Transform the sensor input into an integer index
    def SensorIndex(self, x, xmax):
        return int(
            np.floor((x + xmax) / (2 * xmax + 10 * np.finfo(float).eps) *
                     2**self.Ssize))

    # Update the state of the sensor
    def UpdateSensors(self):
        self.speed_ind = self.SensorIndex(self.speed, self.maxspeed)
        self.sensors = 2 * bitfield(self.speed_ind, self.Ssize) - 1

    # Execute step of the Glauber algorithm to update the state of one unit
    def GlauberStep(self, i=None):
        if i is None:
            i = np.random.randint(self.size)

        I = 0
        if i < self.Ssize:
            I = self.sensors[i]
        eDiff = 2 * self.s[i] * (self.h[i] + I +
                                 np.dot(self.J[i, :] + self.J[:, i], self.s))
        if eDiff * self.Beta < np.log(1 / np.random.rand() - 1):  # Glauber
            self.s[i] = -self.s[i]

    # Update random unit of the agent
    def Update(self, i=None):
        if i is None:
            i = np.random.randint(-1, self.size)
        if i == -1:
            self.Move()
            self.UpdateSensors()
        else:
            self.GlauberStep(i)

    # Sequentially update state of all units of the agent in random order
    def SequentialUpdate(self):
        for i in np.random.permutation(range(-1, self.size)):
            self.Update(i)

    # Step of the learning algorithm to adjust correlations toward the critical regime
    def AdjustCorrelations(self, T=None):
        if T is None:
            T = self.defaultT

        self.m = np.zeros(self.size)
        self.c = np.zeros((self.size, self.size))
        self.C = np.zeros((self.size, self.size))

        # Main simulation loop:
        samples = []
        for t in range(T):

            self.SequentialUpdate()
            #self.x[t] = self.position
            self.m += self.s
            for i in range(self.size):
                self.c[i, i + 1:] += self.s[i] * self.s[i + 1:]
        self.m /= T
        self.c /= T
        for i in range(self.size):
            self.C[i, i + 1:] = self.c[i, i + 1:] - self.m[i] * self.m[i + 1:]

        c1 = np.zeros((self.size, self.size))
        for i in range(self.size):
            inds = np.array([], int)
            c = np.array([])
            for j in range(self.size):
                if not i == j:
                    inds = np.append(inds, [j])
                if i < j:
                    c = np.append(c, [self.c[i, j]])
                elif i > j:
                    c = np.append(c, [self.c[j, i]])
            order = np.argsort(c)[::-1]
            c1[i, inds[order]] = self.Cint[i, :]
        self.c1 = np.triu(c1 + c1.T, 1)
        self.c1 *= 0.5

        self.m[0:self.Ssize] = 0
        self.m1[0:self.Ssize] = 0
        self.c[0:self.Ssize, 0:self.Ssize] = 0
        self.c[-self.Msize:, -self.Msize:] = 0
        self.c[0:self.Ssize, -self.Msize:] = 0
        self.c1[0:self.Ssize, 0:self.Ssize] = 0
        self.c1[-self.Msize:, -self.Msize:] = 0
        self.c1[0:self.Ssize, -self.Msize:] = 0
        dh = self.m1 - self.m
        dJ = self.c1 - self.c
        #print(self.m1,self.m)
        #print("dh, dJ:",dh,",",dJ)
        return dh, dJ

    # Algorithm for poising an agent in a critical regime
    def CriticalLearning(self, Iterations, T=None):
        u = 0.01
        count = 0
        dh, dJ = self.AdjustCorrelations(T)
        fit = max(np.max(np.abs(self.c1 - self.c)),
                  np.max(np.abs(self.m1 - self.m)))
        for it in range(Iterations):
            count += 1
            self.h += u * dh
            self.J += u * dJ
            if it % 10 == 0:
                self.randomize_state()
                self.randomize_position()
            Vmax = self.max_weights
            for i in range(self.size):
                if np.abs(self.h[i]) > Vmax:
                    self.h[i] = Vmax * np.sign(self.h[i])
                for j in np.arange(i + 1, self.size):
                    if np.abs(self.J[i, j]) > Vmax:
                        self.J[i, j] = Vmax * np.sign(self.J[i, j])

            dh, dJ = self.AdjustCorrelations(T)
            fit = np.max(np.abs(self.c1 - self.c))
class Simulator():

    def __init__(self, data_dictionary=None, policy_name=None, policy_directory=None):

        self.save_directory = 'rory_data'

        self.trainer = QLearning()
        self.data = data_dictionary

        self.num_actions = self.trainer.num_avail_actions
        self.num_positions = self.trainer.num_avail_positions
        self.num_velocities = self.trainer.num_avail_velocities

        if data_dictionary is None:
            if policy_directory is None:
                self.load_directory = self.trainer.save_directory
            else:
                self.load_directory = policy_directory

            if policy_name is None:
                self.policy_name = self.grab_newest_policy()
            else:
                self.policy_name = policy_name

            # goal_theta_num, goal_theta_den = self.get_goal_theta(self.policy_name)

            # self.goal_theta = goal_theta_num / goal_theta_den
            self.goal_theta = np.pi / 4

            self.file = os.path.join(self.load_directory, self.policy_name)
            self.policy = self.load_policy()
            self.data = dict()
            

        else:
            self.goal_theta = self.data['goal_theta']
            self.policy = self.data['policy']
            self.file = self.data['fname']
            self.policy_name = self.file

        self.env = PendulumEnv(goal_theta=self.goal_theta)

        self.thetas = np.linspace(-self.env.angle_limit, self.env.angle_limit, self.num_positions)
        self.theta_dots = np.linspace(-self.env.max_speed, self.env.max_speed, self.num_velocities)
        self.actions = np.linspace(-self.env.max_torque, self.env.max_torque, self.num_actions)

        self.dummy_q = self.trainer.q_matrix
        self.num_useful_actions = 0

        self.torques = []
        self.theta_errors = []

    
    def load_policy(self):
        policy = np.load(self.file, allow_pickle=True)
        policy = policy.item().get('policy')
        return policy

    
    def grab_newest_policy(self):
        all_policies = os.listdir(self.load_directory)

        file_count = 0
        theta_dict = {}
        name_dict = {}

        for i, policy in enumerate(all_policies):

            if not policy.startswith('.'): # ignore hidden files such as .DS_Store (a macOS artifact)
                fname, _ = os.path.splitext(policy)
                
                try:
                    gtheta = '_'.join(fname.split('_')[-2:])
                    fname = '_'.join(fname.split('_')[:-2])
                    time_components = np.array(list(map(int, fname.split('_'))))
                    file_count += 1
                except ValueError:
                    continue

                theta_dict[i] = gtheta

                if file_count == 1:
                    name_array = time_components
                else:
                    name_array = np.row_stack((name_array, time_components))

                name_dict[fname] = i

        while len(name_array.shape) > 1:
            col_idx_diff =  np.any(name_array != name_array[0,:], axis = 0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]
        
        newest_policy = name_array
        newest_policy = '_'.join(map(str, newest_policy))
        suffix_theta = theta_dict[name_dict[newest_policy]]
        newest_policy += '_' + suffix_theta + '.npy'
            
        return newest_policy
    

    def get_goal_theta(self,pol_name):
        # same logic as grab_newest_policy(), except it only extracts the goal theta
        fname, _ = os.path.splitext(pol_name)
        len_fname = len(fname) - 1

        if 'pi' in fname:
            idx_pi = fname.find('pi')
            
            if idx_pi != len_fname: # index of numerator
                num = np.pi
            else: # index of denominator
                den = np.pi

            fname = fname.replace('pi','555')
        
        if 'ip' in fname:
            idx_ip = fname.find('ip')

            if idx_ip != len_fname: # index of numerator
                num = -np.pi
            else: # index of denominator
                den = -np.pi

            fname = fname.replace('ip','555')
                       
        time_components = np.array(list(map(int, fname.split('_'))))
        name_array = time_components

        while len(name_array.shape) > 1:
            col_idx_diff =  np.any(name_array != name_array[0,:], axis = 0).nonzero()[0][0]
            row_idx_curr_max = np.argwhere(name_array[:, col_idx_diff] == np.amax(name_array[:, col_idx_diff])).squeeze()
            name_array = name_array[row_idx_curr_max, :]
        
        newest_policy = name_array
        temp_num = newest_policy[-2]
        temp_den = newest_policy[-1]

        if temp_num != 555:
            num = temp_num
        if temp_den != 555:
            den = temp_den

        return num, den


    def getQMatrixIdx(self, th, thdot, torque):
        thIdx = np.abs(self.thetas - th).argmin()
        thdotIdx = np.abs(self.theta_dots - thdot).argmin()
        torIdx = np.abs(self.actions - torque).argmin()

        return torIdx, thIdx, thdotIdx


    def getMaxQValue(self, th, thdot):
        # returns the action (depth) index with the highest q-value for the
        # given th,thdot state, along with that q-value
        maxQValIdx = self.policy[:, th, thdot].argmax()
        maxQVal = self.policy[maxQValIdx, th, thdot]

        return maxQValIdx, maxQVal


    def get_action(self, th, thdot):
        _, thIdx, thdotIdx = self.getQMatrixIdx(th, thdot, 0)
        chosen_idx, _ = self.getMaxQValue(thIdx, thdotIdx)

        action = self.actions[chosen_idx]
        u = np.array([action]).astype(self.env.action_space.dtype)

        return u

    
    def save_precious_simulated_data(self):
        self.data["simulated_episodes"] = self.num_episodes
        self.data["simulated_iterations"] = self.num_iterations
        self.data["avg_sim_cost"] = self.avg_cost
        self.data["perc_useful_actions"] = self.num_useful_actions
        self.data["torque_arr"] = self.torques
        self.data["theta_error_arr"] = self.theta_errors

        fname = self.policy_name

        save_path = os.path.join(self.save_directory, fname)

        if not os.path.exists(self.save_directory):
            os.mkdir(self.save_directory)

        np.save(save_path, self.data)

        print(f'saved precious data: {fname}')


    def update_dummy_q(self,torI,thI,thDotI):
        if self.dummy_q[torI,thI,thDotI] != 1000:
            self.dummy_q[torI,thI,thDotI] = 1000

    
    def simulate(self, ep_num=500, iter_num=150, start_pos=None, start_vel=None):
        print(f'Running simulation using policy: {self.file}')

        self.num_episodes = ep_num
        self.num_iterations = iter_num
        total_total_cost = 0

        try:

            for i in range(self.num_episodes):

                th, thdot = self.env.reset()

                if start_pos is not None:
                    self.env.state[0] = start_pos
                    th = start_pos
                
                if start_vel is not None:
                    self.env.state[1] = start_vel
                    thdot = start_vel

                for _ in range(self.num_iterations):

                    self.env.render()

                    self.theta_errors.append(self.goal_theta - th)

                    u = self.get_action(th, thdot)
                    self.torques.append(u)

                    torIdx,thIdx,thDotIdx = self.getQMatrixIdx(th, thdot, u)

                    self.update_dummy_q(torIdx,thIdx,thDotIdx)

                    nextTh, nextThdot, reward = self.env.step(u)
                    total_total_cost += reward

                    th = nextTh
                    thdot = nextThdot
                
                    time.sleep(.05)

                if i != self.num_episodes-1:
                    self.torques = []
                    self.theta_errors = []
        
        except KeyboardInterrupt:
            pass

        self.avg_cost = total_total_cost / self.num_episodes
        self.num_useful_actions = (self.dummy_q == 1000).sum() / self.dummy_q.size
        
        self.env.close()
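

# Hypothetical usage sketch for the Simulator class above (not part of the
# original source); it assumes a saved policy file in the format load_policy()
# expects already exists in the 'saved_policies' directory:
if __name__ == '__main__':
    sim = Simulator(policy_directory='saved_policies')  # loads the newest policy found there
    sim.simulate(ep_num=5, iter_num=200, start_pos=0.0, start_vel=0.0)
    sim.save_precious_simulated_data()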