コード例 #1
 def RewardFunction(self, s_t, a_t, s_t_1, time_step=0):
     """Return the discounted reward for taking a_t in s_t and reaching s_t_1.

     Args:
         s_t: State before the action; its first two entries are passed to
             GetStateNumber.
         a_t: Action taken. Not used by the reward computation.
         s_t_1: State reached after the action; same layout as s_t.
         time_step: Exponent applied to self.gamma to discount the reward.

     Returns:
         -10 * gamma**time_step when the state number is 21 both before and
         after the action, +10 * gamma**time_step when s_t_1 is terminal
         according to self.isTerminalState, and 0 otherwise.
     """
     before = GetStateNumber(s_t[0], s_t[1], self.dimensions)
     after = GetStateNumber(s_t_1[0], s_t_1[1], self.dimensions)
     # The penalty check comes first, so it wins over the terminal bonus.
     if before == after and before == 21:
         return -10 * (self.gamma ** time_step)
     if self.isTerminalState(s_t_1):
         return 10 * (self.gamma ** time_step)
     return 0
コード例 #2
 def init_e_function(self):
     """Reset self.e_vals to one zeroed three-entry row per state index.

     The table has GetStateNumber(4, 4, self.dimensions) + 1 rows, so it can
     be indexed directly by any state number up to that value.

     Returns:
         The freshly built self.e_vals list.
     """
     num_states = GetStateNumber(4, 4, self.dimensions) + 1
     # Build a distinct inner list per state so rows never alias each other.
     self.e_vals = [[0, 0, 0] for _ in range(num_states)]
     return self.e_vals
コード例 #3
 def learn_policy_fchc(self, num_iter, sigma, num_episodes):
     reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-3)
     curr_iter = 0
     data = []
     theta_max = []
     global_max = -2**31
     theta = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=sigma, condition=True)
     softmax_theta = np.exp(theta)
     softmax_theta = softmax_theta/np.sum(softmax_theta, axis=1)[:,None]
     j = self.evaluate(softmax_theta, num_episodes)
     while curr_iter < num_iter:
         print "-----------------------------"
         print "At ITER: ", curr_iter
         theta_sampled = util.sample(distribution='gaussian', theta=theta, sigma=sigma, reshape_param=reshape_param)
         softmax_theta = np.exp(theta_sampled)
         softmax_theta = softmax_theta/np.sum(softmax_theta, axis=1)[:,None]
         j_n = self.evaluate(softmax_theta, num_episodes)
         if j_n > j:
             theta = theta_sampled
             j = j_n
             print "MAX REWARD: ", j, " AT iter: ", curr_iter
         if j_n > global_max:
             global_max = j_n
             theta_max = theta
             print "GLOBAL MAX UPDATED: ", global_max, " AT iter: ", curr_iter
         print "-----------------------------"
         curr_iter += 1
     print "Saving Data"
     pkl.dump(data, open("fchcFILE.pkl", 'w'))
     pkl.dump(theta_max, open("fchcTHETA.pkl", 'w'))
コード例 #4
 def learn_policy_fchc_multiprocessing(self, num_iter, steps_per_trial, sigma, num_episodes):
     """Run num_iter independent hill-climbing trials of steps_per_trial steps.

     Every trial starts from fresh parameters returned by util.get_init and
     then repeatedly samples a candidate with util.sample, keeping it when
     self.evaluate scores it higher than the current parameters.

     Args:
         num_iter: Number of trials (restarts).
         steps_per_trial: Candidates sampled per trial.
         sigma: Spread forwarded to util.get_init and util.sample.
         num_episodes: Forwarded to self.evaluate for every evaluation.

     Returns:
         None. The best parameters are not stored or returned by this
         method.
     """
     reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-1)
     curr_iter = 0
     while curr_iter < num_iter:
         theta, _ = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=sigma)
         # Score the starting point as row-wise softmax probabilities, the
         # same way learn_policy_fchc does. The reshape is a no-op if
         # get_init already returns a (states, actions) table.
         # NOTE(review): assumes theta holds states*actions values -- confirm.
         softmax_theta = np.exp(np.reshape(theta, reshape_param))
         softmax_theta /= np.sum(softmax_theta, axis=1)[:,None]
         j = self.evaluate(softmax_theta, num_episodes)
         for i in range(steps_per_trial):
             theta_sampled = util.sample(distribution='gaussian', theta=theta, sigma=sigma, reshape_param=reshape_param)
             softmax_theta = np.exp(theta_sampled)
             softmax_theta /= np.sum(softmax_theta, axis=1)[:,None]
             # Evaluate the probabilities, not the raw parameters.
             j_n = self.evaluate(softmax_theta, num_episodes)
             if j_n > j:
                 theta = theta_sampled
                 j = j_n
         # Advance the trial counter; without this the loop never ends.
         curr_iter += 1
コード例 #5
 def learn_policy_bbo_multiprocessing(self, init_population, best_ke, num_episodes, epsilon, num_iter, steps_per_trial=15, variance=100):
     assert init_population >= best_ke
     assert num_episodes > 1
     curr_iter = 0
     reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-3)
     data = []
     theta_max = []
     max_av_reward = -2**31
     while (curr_iter < num_iter):
         theta, sigma = util.get_init(state_space=reshape_param[0],action_space=reshape_param[1], sigma=variance)
         for i in range(steps_per_trial):
             values = []
             print "-----------------------------"
             print "At ITER: ", curr_iter
             print "AT step: ", i
             theta_sampled= util.sample('gaussian', theta, sigma, reshape_param, init_population)
             theta_sampled = variance*theta_sampled
             softmax_theta = np.exp(theta_sampled)
             tic = time.time()
             pool = Pool(multiprocessing.cpu_count())
             mp_obj = multiprocessing_obj(num_episodes)
             values = pool.map(mp_obj, self.iterable(softmax_theta))
             toc = time.time()
             values = sorted(values, key=lambda x: x[1], reverse=True)
             print "Max reward: ", values[0][1]
             if max_av_reward < values[0][1]:
                 max_av_reward = values[0][1]
                 print "MAX REWARD UPDATED"
                 theta_max = values[0][0]
             theta, sigma = util.generate_new_distribution('gaussian', theta, values, best_ke, epsilon)
             print "-----------------------------"
         curr_iter += 1
     print "Saving data"
     pkl.dump(data, open("FILE.pkl", 'w'))
     pkl.dump(theta_max, open("THETA.pkl", 'w'))
コード例 #6
 def learn_policy_bbo(self, init_population, best_ke, num_episodes, epsilon, num_iter, steps_per_trial=15, sigma=100):
     assert init_population >= best_ke
     assert num_episodes > 1
     max_av_reward = -2**31
     theta_max = []
     curr_iter = 0
     reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-1)
     data = []
     while (curr_iter < num_iter):
         theta, sigma = util.get_init(state_space=reshape_param[0],action_space=reshape_param[1], sigma=sigma)
         for i in range(steps_per_trial):
             values = []
             print "-----------------------------"
             print "At ITER: ", curr_iter
             print "AT step: ", i
             theta_sampled= util.sample('gaussian', theta, sigma, reshape_param, init_population)
             tic = time.time()
             for k in range(init_population):
                 theta_k = softmax_theta[k]
                 theta_k = theta_k/np.sum(theta_k, axis=1)[:,None]
                 j_k = self.evaluate(theta_k, num_episodes)
                 if j_k > max_av_reward:
                     max_av_reward = j_k
                     theta_max = theta_k
                     print "MAX REWARD: ", max_av_reward, " AT step, iter: ", i, curr_iter
                 values.append((theta_k.reshape(reshape_param[0]*reshape_param[1], 1), j_k))  
             toc = time.time()
             values = sorted(values, key=lambda x: x[1], reverse=True)
             theta, sigma = util.generate_new_distribution('gaussian', theta, values, best_ke, epsilon)
             print "-----------------------------"
         curr_iter += 1
     print "Saving Data"
     pkl.dump(data, open("FILE.pkl", 'w'))
     pkl.dump(theta_max, open("THETA.pkl", 'w'))
コード例 #7
 def getActionFromPolicy(self, state, policy='uniform'):
     if isinstance(policy, str) and policy == 'uniform':
         return random.randint(1,4)
         theta = policy
         s_t = GetStateNumber(state[0], state[1], self.dimensions)
         currRow = theta[s_t-1]
         random_number = 1.0*random.randint(0,99)/100
         action_array = sorted(zip(np.arange(len(currRow)), currRow), key=lambda x: x[1], reverse=True)
         prev_proba = 0
         for action, probability in action_array:
             prev_proba += probability
             if random_number <= prev_proba:
                 if self.debug:
                     print "Action Array: ", action_array
                     print "Rand number: ",random_number
                     print "Action selected: ", action + 2    
                 return action + 2
         if self.debug:
             print "!!!!!!!!! NOT RETURNING ANYTHING !!!!!!!!!"
             print "Action Array: ", action_array
             print "Rand number: ",random_number
             print "Action selected: ", "NOTHING"
コード例 #8
 def isTerminalState(self, state):
     """Report whether *state* maps to state number 23."""
     first, second = state[0], state[1]
     state_number = GetStateNumber(first, second, self.dimensions)
     return state_number == 23
コード例 #9
 def getStateId(self, state):
     """Return the number GetStateNumber assigns to *state*."""
     first, second = state[0], state[1]
     state_id = GetStateNumber(first, second, self.dimensions)
     return state_id
コード例 #10
 def initValueFunction(self):
     """Reset self.states to zeros, one slot per state index, and return it.

     The list has GetStateNumber(4, 4, self.dimensions) + 1 entries.
     """
     slot_count = GetStateNumber(4, 4, self.dimensions) + 1
     self.states = [0] * slot_count
     return self.states
コード例 #11
 def init_q_function(self):
     """Reset self.q_vals to one random three-entry row per state index.

     Each row is drawn with np.random.uniform(0, 100, 3). The table has
     GetStateNumber(4, 4, self.dimensions) + 1 rows, so it can be indexed
     directly by any state number up to that value.

     Returns:
         The freshly built self.q_vals list of NumPy arrays.
     """
     num_states = GetStateNumber(4, 4, self.dimensions) + 1
     # Store every sampled row; a row per state is what makes the table usable.
     self.q_vals = [np.random.uniform(0, 100, 3) for _ in range(num_states)]
     return self.q_vals
コード例 #12
 def TransitionFunction(self, state, action):
     if self.debug:
         print "Coming with ACTION: ", self.actionSpace[action], " and at STATE: ", state[0], state[1], " STATE NUMBER ", GetStateNumber(state[0], state[1], self.dimensions)
     effect, proba = self.rollTheDice()
     if self.debug:
         print "Effect and Probability val: ", effect, proba
     action = self.affectWithProbability(action, effect)
     tempState = [state[0],state[1]]
     if action == "up":
         tempState[0] -= 1
     elif action == "down":
         tempState[0] += 1
     elif action == "right":
         tempState[1] += 1
     elif action == "left":
         tempState[1] -= 1
     elif action == "stay":
         action = 'stay'
         if self.debug:
             print "Choosing to stay because of failure"
         if self.debug:
             print "Invalid Action"
         return state
     if self.isValid(tempState):
         if self.debug:
             print "Action chosen: ", action
         return tempState
         if self.debug:
             print action, " Transitioning to Invalid state, choosing to STAY"
         return state