def learn_policy_fchc(self, num_iter, sigma, num_episodes): reshape_param = (GetStateNumber(4,3,self.dimensions), len(self.actionSpace)-3) curr_iter = 0 data = [] theta_max = [] global_max = -2**31 theta = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=sigma, condition=True) softmax_theta = np.exp(theta) softmax_theta = softmax_theta/np.sum(softmax_theta, axis=1)[:,None] j = self.evaluate(softmax_theta, num_episodes) while curr_iter < num_iter: print "-----------------------------" print "At ITER: ", curr_iter theta_sampled = util.sample(distribution='gaussian', theta=theta, sigma=sigma, reshape_param=reshape_param) softmax_theta = np.exp(theta_sampled) softmax_theta = softmax_theta/np.sum(softmax_theta, axis=1)[:,None] j_n = self.evaluate(softmax_theta, num_episodes) data.append(j_n) if j_n > j: theta = theta_sampled j = j_n print "MAX REWARD: ", j, " AT iter: ", curr_iter if j_n > global_max: global_max = j_n theta_max = theta print "GLOBAL MAX UPDATED: ", global_max, " AT iter: ", curr_iter print "-----------------------------" curr_iter += 1 print "Saving Data" pkl.dump(data, open("fchcFILE.pkl", 'w')) pkl.dump(theta_max, open("fchcTHETA.pkl", 'w'))
def learn_policy_bbo(self, init_population, best_ke, num_episodes, epsilon, num_iter, steps_per_trial=15, sigma=100): assert init_population >= best_ke assert num_episodes > 1 curr_iter = 0 reshape_param = (31, 2) data = [] theta_max = [] max_av_reward = -2**31 while (curr_iter < num_iter): theta, sigma = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=sigma) for i in range(steps_per_trial): values = [] print "-----------------------------" print "At ITER: ", curr_iter print "AT step: ", i theta_sampled = util.sample('gaussian', theta, sigma, reshape_param, init_population) theta_sampled = np.exp(theta_sampled) tic = time.time() for k in range(init_population): theta_k = theta_sampled[k] theta_k = theta_k / np.sum(theta_k, axis=1)[:, None] j_k = self.evaluate(theta_k, num_episodes) data.append(j_k) if j_k > max_av_reward: max_av_reward = j_k theta_max = theta_k print "MAX REWARD: ", max_av_reward, " AT step, iter: ", i, curr_iter values.append( (theta_k.reshape(reshape_param[0] * reshape_param[1], 1), j_k)) toc = time.time() print(toc - tic) values = sorted(values, key=lambda x: x[1], reverse=True) theta, sigma = util.generate_new_distribution( 'gaussian', theta, values, best_ke, epsilon) print "-----------------------------" curr_iter += 1 print "Saving Data" pkl.dump(data, open("FILE.pkl", 'w')) pkl.dump(theta_max, open("THETA.pkl", 'w'))
def learn_policy_fchc_multiprocessing(self, num_iter, steps_per_trial, sigma,
                                      num_episodes):
    """Restarted First-Choice Hill Climbing.

    Despite the name, this variant does not use multiprocessing; it runs
    num_iter independent restarts, each performing steps_per_trial
    hill-climbing steps, and returns the best result found.

    BUGFIXES vs. the original:
      * curr_iter was never incremented, so the outer while-loop never
        terminated;
      * the softmax of the sampled (and initial) theta was computed but the
        *raw* parameters were handed to self.evaluate — sibling
        learn_policy_fchc always evaluates the softmax policy;
      * all results were discarded; the best (theta, return) pair is now
        returned (backward compatible — old callers ignored the None).

    Returns (best_theta, best_return) across all restarts.
    """
    reshape_param = (GetStateNumber(4, 3, self.dimensions),
                     len(self.actionSpace) - 1)
    best_theta = None
    best_j = -2**31
    curr_iter = 0
    while curr_iter < num_iter:
        # Fresh random restart for this trial.
        theta, _ = util.get_init(state_space=reshape_param[0],
                                 action_space=reshape_param[1],
                                 sigma=sigma)
        softmax_theta = np.exp(theta)
        softmax_theta /= np.sum(softmax_theta, axis=1)[:, None]
        j = self.evaluate(softmax_theta, num_episodes)
        for i in range(steps_per_trial):
            theta_sampled = util.sample(distribution='gaussian', theta=theta,
                                        sigma=sigma,
                                        reshape_param=reshape_param)
            softmax_theta = np.exp(theta_sampled)
            softmax_theta /= np.sum(softmax_theta, axis=1)[:, None]
            # Evaluate the normalized policy, not the raw parameters.
            j_n = self.evaluate(softmax_theta, num_episodes)
            if j_n > j:
                # Hill-climbing acceptance.
                theta = theta_sampled
                j = j_n
        if j > best_j:
            best_j = j
            best_theta = theta
        curr_iter += 1  # BUGFIX: terminate the restart loop
    return best_theta, best_j
def learn_policy_bbo_multiprocessing(self, init_population, best_ke, num_episodes, epsilon, num_iter, steps_per_trial=15, variance=10): assert init_population >= best_ke assert num_episodes > 1 curr_iter = 0 reshape_param = (31, 2) data = [] theta_max = [] max_av_reward = -2**31 while (curr_iter < num_iter): theta, sigma = util.get_init(state_space=reshape_param[0], action_space=reshape_param[1], sigma=variance) for i in range(steps_per_trial): values = [] print "-----------------------------" print "At ITER: ", curr_iter print "AT step: ", i theta_sampled = util.sample('gaussian', theta, sigma, reshape_param, init_population) theta_sampled = variance * theta_sampled softmax_theta = np.exp(theta_sampled) tic = time.time() pool = Pool(multiprocessing.cpu_count()) mp_obj = multiprocessing_obj(num_episodes) values = pool.map(mp_obj, self.iterable(softmax_theta)) data.append(np.array(values)[:, 1].tolist()) pool.close() pool.join() toc = time.time() values = sorted(values, key=lambda x: x[1], reverse=True) print "Max reward: ", values[0][1] if max_av_reward < values[0][1]: max_av_reward = values[0][1] print "MAX REWARD UPDATED" theta_max = values[0][0] theta, sigma = util.generate_new_distribution( 'gaussian', theta, values, best_ke, epsilon) print "-----------------------------" curr_iter += 1 print "Saving data" pkl.dump(data, open("FILE.pkl", 'w')) pkl.dump(theta_max, open("THETA.pkl", 'w'))