def learn_policy_fchc(self, num_iter, sigma, num_episodes):
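     """First-choice hill climbing on a tabular softmax policy: repeatedly
     sample a Gaussian perturbation of theta, keep it whenever the average
     return over num_episodes improves, and pickle the logged returns and the
     best parameters found."""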
     reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-3)
     curr_iter = 0
     data = []
     theta_max = []
     global_max = -2**31
     theta = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=sigma, condition=True)
     softmax_theta = np.exp(theta)
     softmax_theta = softmax_theta/np.sum(softmax_theta, axis=1)[:,None]
     j = self.evaluate(softmax_theta, num_episodes)
             
     while curr_iter < num_iter:
         print "-----------------------------"
         print "At ITER: ", curr_iter
         theta_sampled = util.sample(distribution='gaussian', theta=theta, sigma=sigma, reshape_param=reshape_param)
         softmax_theta = np.exp(theta_sampled)
         softmax_theta = softmax_theta/np.sum(softmax_theta, axis=1)[:,None]
         j_n = self.evaluate(softmax_theta, num_episodes)
         data.append(j_n)
         if j_n > j:
             theta = theta_sampled
             j = j_n
             print "MAX REWARD: ", j, " AT iter: ", curr_iter
         if j_n > global_max:
             global_max = j_n
             theta_max = theta_sampled
             print("GLOBAL MAX UPDATED: ", global_max, " AT iter: ", curr_iter)
         print("-----------------------------")
         curr_iter += 1
     print "Saving Data"
     pkl.dump(data, open("fchcFILE.pkl", 'w'))
     pkl.dump(theta_max, open("fchcTHETA.pkl", 'w'))
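# The util helpers used throughout this listing are not shown. The sketch below
# is a guess reconstructed only from the call sites (get_init draws an initial
# parameter matrix, sample perturbs it with Gaussian noise); the real
# implementations in the project's util module may differ. The listing also
# assumes module-level imports such as numpy as np, pickle (or cPickle) as pkl,
# time, and multiprocessing / multiprocessing.Pool.
import numpy as np


def get_init(state_space, action_space, sigma, condition=False):
    # Initial (state_space x action_space) parameter matrix drawn from N(0, sigma).
    theta = np.random.normal(0.0, sigma, size=(state_space, action_space))
    # learn_policy_fchc passes condition=True and expects only theta back;
    # the BBO variants expect the (theta, sigma) pair.
    return theta if condition else (theta, sigma)


def sample(distribution, theta, sigma, reshape_param, population=1):
    # One Gaussian perturbation of theta, or a batch of them for the BBO variants.
    assert distribution == 'gaussian'
    size = reshape_param if population == 1 else (population,) + tuple(reshape_param)
    return np.random.normal(loc=theta, scale=sigma, size=size)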
 def learn_policy_bbo(self,
                      init_population,
                      best_ke,
                      num_episodes,
                      epsilon,
                      num_iter,
                      steps_per_trial=15,
                      sigma=100):
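     """Population-based black-box optimisation: on every step, draw
     init_population Gaussian candidates around theta, evaluate each for
     num_episodes episodes, and refit (theta, sigma) from the best_ke
     candidates via util.generate_new_distribution."""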
     assert init_population >= best_ke
     assert num_episodes > 1
     curr_iter = 0
     reshape_param = (31, 2)  # hard-coded (state_space, action_space) shape
     data = []
     theta_max = []
     max_av_reward = -2**31
     while (curr_iter < num_iter):
         theta, sigma = util.get_init(state_space=reshape_param[0],
                                      action_space=reshape_param[1],
                                      sigma=sigma)
         for i in range(steps_per_trial):
             values = []
             print "-----------------------------"
             print "At ITER: ", curr_iter
             print "AT step: ", i
             theta_sampled = util.sample('gaussian', theta, sigma,
                                         reshape_param, init_population)
             theta_sampled = np.exp(theta_sampled)
             tic = time.time()
             for k in range(init_population):
                 theta_k = theta_sampled[k]
                 theta_k = theta_k / np.sum(theta_k, axis=1)[:, None]
                 j_k = self.evaluate(theta_k, num_episodes)
                 data.append(j_k)
                 if j_k > max_av_reward:
                     max_av_reward = j_k
                     theta_max = theta_k
                     print "MAX REWARD: ", max_av_reward, " AT step, iter: ", i, curr_iter
                 values.append(
                     (theta_k.reshape(reshape_param[0] * reshape_param[1],
                                      1), j_k))
             toc = time.time()
             print(toc - tic)
             values = sorted(values, key=lambda x: x[1], reverse=True)
             theta, sigma = util.generate_new_distribution(
                 'gaussian', theta, values, best_ke, epsilon)
             print "-----------------------------"
         curr_iter += 1
     print "Saving Data"
     pkl.dump(data, open("FILE.pkl", 'w'))
     pkl.dump(theta_max, open("THETA.pkl", 'w'))
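# util.generate_new_distribution is likewise not shown. A plausible sketch,
# assuming a cross-entropy-style update that refits the Gaussian to the best_ke
# elite candidates (values is the reward-sorted list of (flat theta, reward)
# pairs built above); the actual update in util may differ.
def generate_new_distribution(distribution, theta, values, best_ke, epsilon):
    assert distribution == 'gaussian'
    # Stack the elite parameter vectors column-wise: shape (dim, best_ke).
    elites = np.hstack([v[0] for v in values[:best_ke]])
    new_theta = elites.mean(axis=1, keepdims=True)           # elite mean
    new_sigma = elites.std(axis=1, keepdims=True) + epsilon  # keep exploring
    return new_theta, new_sigma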
 def learn_policy_fchc_multiprocessing(self, num_iter, steps_per_trial, sigma, num_episodes):
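     """Restarted first-choice hill climbing: for each of num_iter trials,
     draw a fresh theta and run steps_per_trial hill-climbing steps, keeping a
     sampled perturbation whenever its average return improves."""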
     reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-1)
     curr_iter = 0
     while curr_iter < num_iter:
         theta, _ = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=sigma)
         softmax_theta = np.exp(theta)
         softmax_theta /= np.sum(softmax_theta, axis=1)[:,None]
         j = self.evaluate(softmax_theta, num_episodes)
         for i in range(steps_per_trial):
             theta_sampled = util.sample(distribution='gaussian', theta=theta, sigma=sigma, reshape_param=reshape_param)
             softmax_theta = np.exp(theta_sampled)
             softmax_theta /= np.sum(softmax_theta, axis=1)[:,None]
             j_n = self.evaluate(softmax_theta, num_episodes)
             if j_n > j:
                 theta = theta_sampled
                 j = j_n
         curr_iter += 1
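# Hypothetical usage. Agent, its constructor arguments, and the hyperparameter
# values below are assumptions for illustration; the class these methods belong
# to (with evaluate(), actionSpace and dimensions) is not part of this listing.
#
#     agent = Agent(dimensions=(4, 3))
#     agent.learn_policy_fchc(num_iter=500, sigma=1.0, num_episodes=20)
#     agent.learn_policy_bbo(init_population=50, best_ke=10, num_episodes=20,
#                            epsilon=0.5, num_iter=10)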
 def learn_policy_bbo_multiprocessing(self,
                                      init_population,
                                      best_ke,
                                      num_episodes,
                                      epsilon,
                                      num_iter,
                                      steps_per_trial=15,
                                      variance=10):
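     """Same population-based black-box search as learn_policy_bbo, but the
     init_population candidates of each step are evaluated in parallel with a
     multiprocessing Pool (one picklable multiprocessing_obj worker per
     candidate)."""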
     assert init_population >= best_ke
     assert num_episodes > 1
     curr_iter = 0
     reshape_param = (31, 2)  # hard-coded (state_space, action_space) shape
     data = []
     theta_max = []
     max_av_reward = -2**31
     while (curr_iter < num_iter):
         theta, sigma = util.get_init(state_space=reshape_param[0],
                                      action_space=reshape_param[1],
                                      sigma=variance)
         for i in range(steps_per_trial):
             values = []
             print "-----------------------------"
             print "At ITER: ", curr_iter
             print "AT step: ", i
             theta_sampled = util.sample('gaussian', theta, sigma,
                                         reshape_param, init_population)
             theta_sampled = variance * theta_sampled
             softmax_theta = np.exp(theta_sampled)
             tic = time.time()
             pool = Pool(multiprocessing.cpu_count())
             mp_obj = multiprocessing_obj(num_episodes)
             values = pool.map(mp_obj, self.iterable(softmax_theta))
             data.append(np.array(values)[:, 1].tolist())
             pool.close()
             pool.join()
             toc = time.time()
             values = sorted(values, key=lambda x: x[1], reverse=True)
             print "Max reward: ", values[0][1]
             if max_av_reward < values[0][1]:
                 max_av_reward = values[0][1]
                 print "MAX REWARD UPDATED"
                 theta_max = values[0][0]
             theta, sigma = util.generate_new_distribution(
                 'gaussian', theta, values, best_ke, epsilon)
             print "-----------------------------"
         curr_iter += 1
     print "Saving data"
     pkl.dump(data, open("FILE.pkl", 'w'))
     pkl.dump(theta_max, open("THETA.pkl", 'w'))
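# multiprocessing_obj and self.iterable are assumed by the Pool.map call above
# but are not shown here. A rough sketch, under the assumption that each worker
# returns a (theta_k, average reward) pair and that iterable yields one
# normalised candidate policy at a time; Environment is a hypothetical
# stand-in for whatever the real worker evaluates against.
class multiprocessing_obj(object):
    def __init__(self, num_episodes):
        self.num_episodes = num_episodes

    def __call__(self, theta_k):
        # Evaluate a single candidate policy in a worker process.
        env = Environment()  # hypothetical per-worker environment
        return theta_k, env.evaluate(theta_k, self.num_episodes)


def iterable(self, softmax_theta):
    # Normalise each candidate into a per-state softmax policy and yield it.
    for theta_k in softmax_theta:
        yield theta_k / np.sum(theta_k, axis=1)[:, None]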