    def run_sim(self, p):

        print ("Parameters: ")
        for k, v in p.items():
            if k[0:2] == "__":
                continue
            print (str(k) + " : " + str(v))
        del k
        del v

        # init random number generator from seed
        np.random.seed(p["random_seed"])

        # initialize hyperparameters fresh, unless we are resuming a saved simulation
        # in which case, we load the parameters
        if "load_name" not in p:
            self.init_sim(p)
        else:
            self.load_sim(p)

        # initialize environment
        self.sim = cartpole_environment()
        self.vel_bound = p["vel_bound"]
        self.pos_bound = p["pos_bound"]
        self.angle_vel_bound = p["angle_vel_bound"]
        self.sim.init(
            self.vel_bound,
            self.angle_vel_bound,
            self.pos_bound,
            p["g"],
            p["l"],
            p["mp"],
            p["mc"],
            p["dt"],
            p["negative_reward"],
            p["positive_reward"],
            p["no_reward"],
            p.get("reward_type", 0),
        )

        self.do_vis = p["do_vis"]
        self.save_images = p.get("save_images", False)
        self.image_save_dir = p.get("image_save_dir", None)
        save_interval = p["save_interval"]
        # kept as a local flag so it does not shadow the do_running_printout() method called below
        do_running_printout = p.get("do_running_printout", False)

        self.showevery = p["showevery"]
        self.fastforwardskip = 5
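        # visualization cadence: episodes divisible by showevery are rendered in full; otherwise
        # only the first frame of every fastforwardskip-th episode is drawn (fast-forward mode)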
        push_force = p["push_force"]

        self.reward_type = p.get("reward_type", 0)

        self.use_full_output = p.get("use_full_output", False)

        self.earlyendepisode = np.zeros(10)
        self.earlyendreward = np.zeros(10)
        for i in range(9):
            self.earlyendepisode[i] = p.get("earlyendepisode" + str(i), 0)
            self.earlyendreward[i] = p.get("earlyendreward" + str(i), 0)

        self.do_recurrence = p.get("do_recurrence", False)
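        # with do_recurrence enabled, a hidden state vector of length num_hidden is appended to the
        # observation before each action selection (see the episode loop below)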

        if self.do_vis:
            # only import if we need it, since we don't want to require installation of pygame
            from cartpole.vis.visualize_sdl import visualize_sdl

            v = visualize_sdl()
            v.init_vis(
                p["display_width"],
                p["display_height"],
                p["axis_x_min"],
                p["axis_x_max"],
                p["axis_y_min"],
                p["axis_y_max"],
                p["fps"],
            )

        print_update_timer = time.time()
        self.start_time = time.time()
        elapsed_time = time.time()
        step_duration_timer = time.time()
        save_time = time.time()
        self.avg_step_duration = 1.0

        ##repeat for each episode
        self.r_sum_avg = -0.95
        self.r_sum_avg_list = []
        self.steps_balancing_pole_list = []
        self.steps_balancing_pole_avg = 0.00
        self.steps_balancing_pole_avg_list = []

        while 1:
            # reset eligibility at the beginning of each episode
            # TODO: This should be abstracted into a function call
            if hasattr(self.qsa, "_lambda"):
                for l in self.qsa.net.layer:
                    l.eligibility = np.zeros(l.eligibility.shape, dtype=np.float32)

            self.step = 0
            ##initialize s
            self.sim.reset_state()
            self.s = self.state_transformer.transform(self.sim.get_state())

            if self.do_recurrence:
                # choose a from s using policy derived from Q
                self.h = np.zeros(p["num_hidden"], dtype=np.float32)
                (self.a, self.qsa_tmp, self.h_prime) = self.choose_action_recurrence(np.append(self.s, self.h), p)
            else:
                # choose a from s using policy derived from Q
                (self.a, self.qsa_tmp) = self.choose_action(self.s, p)

            balance_list = []
            self.r_sum = 0.0
            # repeat steps
            quit = False
            save_and_exit = False

            while 1:
                ##take action a, observe r, s'
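                # action encoding: 0 = no push, 1 = push with -push_force, 2 = push with +push_force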
                a_vel = [0.0, -push_force, push_force]
                self.sim.set_action(a_vel[self.a])

                self.sim.step()
                # print("Terminal: " + str(self.sim.is_terminal))
                self.r = self.sim.get_reward(self.reward_type)
                self.s_prime = self.state_transformer.transform(self.sim.get_state())
                self.r_sum += self.r

                # for consistency, we always label balancing steps with the same reward function
                balance_list.append(self.sim.get_reward(0))

                if self.do_recurrence:
                    (self.a_prime, self.qsa_prime, self.h_primeprime) = self.choose_action_recurrence(
                        np.append(self.s_prime, self.h_prime), p
                    )

                    current_s = np.append(self.s, self.h)
                    next_s = np.append(self.s_prime, self.h_prime)
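                    # SARSA update on the hidden-state-augmented input [s, h]:
                    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))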
                    self.qsa.store(
                        current_s,
                        self.a,
                        self.qsa_tmp
                        + self.alpha * (self.r + self.gamma * self.qsa.load(next_s, self.a_prime) - self.qsa_tmp),
                    )
                else:
                    # choose a' from s' using policy derived from Q
                    (self.a_prime, self.qsa_prime) = self.choose_action(self.s_prime, p)

                    # Q(s,a) <- Q(s,a) + alpha[r + gamma*Q(s_prime,a_prime) - Q(s,a)]
                    # todo: qsa_prime can be saved and reused for qsa_tmp
                    # qsa_tmp = self.qsa.load(self.s,self.a)
                    # self.qsa.update(self.s,self.a,self.r,self.s_prime,self.a_prime,self.qsa_tmp)
                    self.qsa.store(
                        self.s,
                        self.a,
                        self.qsa_tmp
                        + self.alpha * (self.r + self.gamma * self.qsa.load(self.s_prime, self.a_prime) - self.qsa_tmp),
                    )

                if self.do_vis:
                    if not (self.episode % self.showevery):
                        self.fast_forward = False
                        v.delay_vis()
                        v.draw_cartpole(self.sim.get_state(), self.a, self.sim.get_reward(self.reward_type), self)
                        exit = v.update_vis()
                        if exit:
                            quit = True
                    elif self.step == 0 and not (self.episode % self.fastforwardskip):
                        self.fast_forward = True
                        v.delay_vis()
                        v.draw_cartpole(self.sim.get_state(), self.a, self.sim.get_reward(self.reward_type), self)
                        exit = v.update_vis()
                        if exit:
                            quit = True

                    # if(p.has_key('print_state_debug') and p['print_state_debug'] == True):
                    #    print("action: " + str(a) + " r: " + str(r) + \
                    #        " Qsa: " + str(self.qsa.load(s,a)) +  " state: " + str(s))
                    #    print("Qs0: " + str(self.qsa.load(s,0)))
                    #    print("Qs1: " + str(self.qsa.load(s,1)))
                    #    print("Qs2: " + str(self.qsa.load(s,2)))

                # TODO: put this printout stuff in a function
                # the self.episode > 0 check prevents a bug where some of the printouts are empty arrays before the first episode completes
                if do_running_printout and time.time() - print_update_timer > 1.0 and self.episode > 0:
                    self.do_running_printout()
                    print_update_timer = time.time()

                if self.episode >= p["train_episodes"]:
                    save_and_exit = True
                    quit = True

                if quit:
                    break
                if self.sim.is_terminal:
                    break
                if self.step > p["max_steps"]:
                    break
                ## s <- s';  a <-- a'
                self.s = self.s_prime
                self.a = self.a_prime
                self.qsa_tmp = self.qsa_prime
                if self.do_recurrence:
                    self.h = self.h_prime
                    self.h_prime = self.h_primeprime

                # print("Next Step \n")
                self.step += 1
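                # exponential moving average of per-step wall-clock time (decay 0.995);
                # its inverse is reported as steps/sec in the episode summary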
                self.avg_step_duration = 0.995 * self.avg_step_duration + (1.0 - 0.995) * (
                    time.time() - step_duration_timer
                )
                step_duration_timer = time.time()
                # end step loop

            # compute the number of steps that have a positive reward, as the number of steps that balanced
            self.steps_balancing_pole = np.sum(np.array(balance_list) > 0.0000001)
            self.steps_balancing_pole_list.append(self.steps_balancing_pole)

            self.steps_balancing_pole_avg = (
                0.995 * self.steps_balancing_pole_avg + (1.0 - 0.995) * self.steps_balancing_pole
            )
            self.steps_balancing_pole_avg_list.append(self.steps_balancing_pole_avg)

            self.r_sum_avg = 0.995 * self.r_sum_avg + (1.0 - 0.995) * self.r_sum

            if p["decay_type"] == "geometric":
                self.epsilon = self.epsilon * p["epsilon_decay"]
                self.epsilon = max(p["epsilon_min"], self.epsilon)
            elif p["decay_type"] == "linear":
                self.epsilon = self.epsilon - p["epsilon_decay"]
                self.epsilon = max(p["epsilon_min"], self.epsilon)

            if p.has_key("learning_rate_decay_type") and p["learning_rate_decay_type"] == "geometric":
                self.alpha = self.alpha * p["learning_rate_decay"]
                self.alpha = max(p["learning_rate_min"] / p["learning_rate"], self.alpha)
            elif p.has_key("learning_rate_decay_type") and p["learning_rate_decay_type"] == "linear":
                self.alpha = self.alpha - p["learning_rate_decay"]
                self.alpha = max(p["learning_rate_min"] / p["learning_rate"], self.alpha)

            # print debug for episode
            m, s = divmod(time.time() - self.start_time, 60)
            h, m = divmod(m, 60)
            sys.stdout.write(
                ("ep: %d" % self.episode)
                + (" epsilon: %2.4f" % self.epsilon)
                + (" avg steps balanced: %2.4f" % self.steps_balancing_pole_avg)
                + (" max steps balanced: %2.4f" % np.max(np.array(self.steps_balancing_pole_avg_list)))
                + (" total_steps: %d" % self.step)
                + (" steps/sec: %2.4f" % (1.0 / self.avg_step_duration))
            )
            if hasattr(self.qsa, "net"):
                if hasattr(self.qsa.net.layer[0], "zeta"):
                    sys.stdout.write(" L0 zeta: %2.4f" % self.qsa.net.layer[0].zeta)
                if hasattr(self.qsa.net.layer[1], "zeta"):
                    sys.stdout.write(" L1 zeta: %2.4f" % self.qsa.net.layer[1].zeta)
            sys.stdout.write(" l_rate: %2.4f" % (self.alpha * p["learning_rate"]))
            print (" Time %d:%02d:%02d" % (h, m, s))
            sys.stdout.flush()

            # early stopping: if the best running average of balanced steps has not reached
            # earlyendreward[i] by episode earlyendepisode[i], end the run
            for i in range(9):
                if (
                    self.earlyendepisode[i] > 0
                    and self.episode == self.earlyendepisode[i]
                    and np.max(np.array(self.steps_balancing_pole_avg_list)) < self.earlyendreward[i]
                ):
                    print("ending early")
                    save_and_exit = True

            # save stuff (TODO: Put this in a save function)
            if time.time() - save_time > save_interval or save_and_exit:
                print("saving results...")
                self.save_results(p["results_dir"] + p["simname"] + p["version"] + ".h5py", p)
                save_time = time.time()

            if quit or save_and_exit:
                break
            self.episode += 1
            # end episode loop

        self.update_results(p)
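        # report the best running average of balanced steps and the episode at which it occurred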
        obj = np.max(self.results["steps_balancing_pole_avg_list"])
        argmax = np.argmax(self.results["steps_balancing_pole_avg_list"])
        print ("obj: " + str(obj) + " argmax: " + str(argmax))
        return self.results
    def run_sim(self, p):
        # manual-control mode: the keyboard drives the cart and the run is visualized; no learning takes place
        sim = cartpole_environment()
        reward_type = p.get('reward_type',0)
        negative_reward = p['negative_reward']
        positive_reward = p['positive_reward']
        sim.init(p['vel_bound'],p['angle_vel_bound'],p['pos_bound'],p['g'],p['l'],p['mp'],p['mc'],p['dt'],p['negative_reward'],p['positive_reward'],p['no_reward'],reward_type)
        # only import if we need it, since we don't want to require installation of pygame
        from cartpole.vis.visualize_sdl import visualize_sdl
        v = visualize_sdl()
        v.init_vis(p['display_width'],p['display_height'],p['axis_x_min'],p['axis_x_max'],p['axis_y_min'],p['axis_y_max'],p['fps'])
        push_force = p['push_force']

        self.vel_bound = p['vel_bound']
        self.pos_bound = p['pos_bound']
        self.angle_vel_bound = p['angle_vel_bound']
        self.mins = np.array([0.0, -self.vel_bound, -self.pos_bound, -self.angle_vel_bound])
        self.maxs = np.array([2*math.pi,  self.vel_bound,  self.pos_bound,  self.angle_vel_bound])
        self.mins = np.append(np.array([-1.0,-1.0]),self.mins[1:])
        self.maxs = np.append(np.array([1.0,1.0]),self.maxs[1:])
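        # the angle component of the bounds is replaced by bounds for its sine and cosine ([-1, 1]);
        # the remaining entries keep the configured velocity/position limits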
        self.incorrect_target = p['incorrect_target']
        self.correct_target = p['correct_target']
        self.num_actions = 3
        action = 0

        while 1:
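            # optional debug printout: reproduce the learner's state transform (sin/cos of the angle,
            # rescaled to roughly [-1.125, 1.125]) and print it alongside the raw simulator state;
            # clear() is assumed to be a console-clearing helper defined elsewhere in this module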
            if 'print_state_debug' in p:
                clear()
                action_list = np.ones((1,self.num_actions))*self.incorrect_target
                action_list[0,action] = self.correct_target
                state = sim.sim.state
                s = np.append(np.array([math.sin(state[0]),math.cos(state[0])]),state[1:])
                s = (np.array(s) - self.mins)/(self.maxs - self.mins)
                s = s-0.5
                s = s*2.25
                s = np.append(s,action_list)
                np.set_printoptions(precision=4)
                print(str(s[:,np.newaxis]))
                print(str(np.array(sim.sim.state)[:,np.newaxis]))
            v.delay_vis()
            k = v.get_keys()

            # map the keyboard state to a push: the first key applies -push_force, the second +push_force
            u = 0.0
            action = 0
            if k[0]:
                action = 2
                u = -push_force
            if k[1]:
                action = 1
                u = push_force
            sim.set_action(u)
            sim.step()
            #if(sim.state[2] < -4.0):
            #    sim.state[2] = -4.0
            #if(sim.state[2] > 4.0):
            #    sim.state[2] = 4.0
            if sim.is_terminal:
                sim.reset_state()

            v.draw_cartpole(sim.get_state(), action, sim.get_reward(reward_type) / positive_reward)
            exit = v.update_vis()
            if exit:
                break
        return