def cartpole_sampling(theta, cm, K, Ke, N, epsilon):
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            cartpole = CartPole()
            cartpole.pi_params = theta_list[x].reshape(4, 2)
            epi = CartPoleEpisode(cartpole)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))
    # print(sorted(result_list, key=lambda n: n[-1], reverse=True))
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    # print(elite_list)
    theta_final = np.zeros(8)
    cm_final = epsilon * np.identity(8)
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    # print(cm_final)
    J_final /= Ke
    return theta_final, cm_final, J_final
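# A minimal sketch of the outer cross-entropy-method loop that could drive
# cartpole_sampling above; the population size K, elite count Ke, rollout
# count N, epsilon, the initial distribution, and the iteration count are
# illustrative assumptions, not values from the original experiments.
def cem_cartpole(iterations=50, K=20, Ke=5, N=5, epsilon=1.0):
    theta = np.zeros(8)             # mean of the 4x2 policy-parameter distribution
    cm = 10.0 * np.identity(8)      # initial covariance (assumed)
    for it in range(iterations):
        theta, cm, J = cartpole_sampling(theta, cm, K, Ke, N, epsilon)
        print('iteration', it, ', mean elite return', J)
    return theta, cm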
def evaluate(index):
    game = CartPole()
    actions = game.legal_actions
    dqn = DQN(actions)
    dqn.epsilon = 0
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("networks")
    if checkpoint:
        saver.restore(sess, checkpoint.all_model_checkpoint_paths[index])
        print "Loaded: %s" % checkpoint.all_model_checkpoint_paths[index]
    rewards = []
    for episode in range(200):
        state = game.newGame()
        totReward = 0
        for _ in range(400):
            if episode == 199:
                game.env.render()
            action = dqn.selectAction(state)
            actionNum = np.argmax(action)
            next_state, reward, game_over = game.next(actionNum)
            totReward += reward
            state = next_state
            if game_over:
                break
        rewards.append(totReward)
    print rewards
    print "Average %s, best %s" % (sum(rewards) / len(rewards), max(rewards))
def cartpole_evaluate(table, N):
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
def multi_cartpole_episode(table, l):
    for i in l:
        cartpole = CartPole()
        # print(i)
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        cp_q.put(epi.run_all_steps())
    return 0
def cartpole_evaluate(t, N):
    reward_l = []
    for i in range(N):
        cartpole = CartPole()
        # print(i)
        cartpole.pi_params = t.reshape(4, 2)
        epi = CartPoleEpisode(cartpole)
        reward_l.append(epi.run_all_steps())
    return sum(reward_l) / N
def run_expt(alpha_v, alpha_pi, gamma, lmbda):
    # no. of episodes and runs
    num_episodes = 1000
    max_steps = 1000
    steps_per_episode = np.zeros((num_episodes, ))
    avg_steps = 0.0
    sess = tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
                                            intra_op_parallelism_threads=1))
    optimizer_critic = tf.train.GradientDescentOptimizer(learning_rate=alpha_v)
    optimizer_actor = tf.train.GradientDescentOptimizer(learning_rate=alpha_pi)
    ac_agent = actorCritic(sess, optimizer_critic, optimizer_actor, critic_network,
                           actor_network, gamma * lmbda, state_dim, num_actions)
    cartpole_domain = CartPole()
    for current_episode in range(num_episodes):
        current_state = cartpole_domain.reset()
        rescaled_current_state = rescale_states(current_state[0], current_state[1],
                                                current_state[2], current_state[3])
        update_target = np.zeros(1)
        G = 0.0
        step = 0
        while current_state is not None and step < max_steps:
            a_t = ac_agent.sampleAction(np.array(rescaled_current_state).reshape(1, state_dim))
            r_t, next_state = cartpole_domain.move(a_t)
            G += (gamma * r_t)
            step += 1
            # v_current = ac_agent.predictValue(np.array(rescaled_current_state).reshape(1, state_dim))
            v_next = np.zeros(1)
            rescaled_next_state = None
            if next_state is not None:
                rescaled_next_state = rescale_states(next_state[0], next_state[1],
                                                     next_state[2], next_state[3])
                v_next = ac_agent.predictValue(np.array(rescaled_next_state).reshape(1, state_dim))
                # print("v_next: {}".format(v_next))
            update_target = r_t + (gamma * v_next)
            # print("update_target: {}".format(update_target))
            # delta = r_t + (gamma * v_next) - v_current
            ac_agent.updateModel(np.array(rescaled_current_state).reshape(1, state_dim),
                                 np.array([a_t]), np.array(update_target))
            rescaled_current_state = np.copy(rescaled_next_state)
            current_state = next_state
        steps_per_episode[current_episode] = step
        avg_steps = avg_steps + step
    avg_steps = avg_steps * 1.0 / num_episodes
    sess.close()
    tf.reset_default_graph()
    return (avg_steps, steps_per_episode)
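# Example invocation of run_expt; the learning rates, discount factor, and
# eligibility-trace parameter below are placeholder assumptions for
# illustration, not tuned values from the original experiment.
avg_steps, steps_per_episode = run_expt(alpha_v=0.01, alpha_pi=0.001, gamma=1.0, lmbda=0.8)
print('average steps per episode:', avg_steps)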
def td_cp_single(f_order, alpha):
    d = 4
    cartpole = CartPole()
    print('cartpole ', f_order, ' td')
    weight = np.zeros((1, (f_order + 1) ** d))
    # update weight in 100 loops
    print('alpha = ', alpha)
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
            s = new_s
            print(weight)
            count += 1
    # calculate td in another 100 loops
    td_list = []
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
            s = new_s
            count += 1
        td_list.append(0)
    print('square td = ', np.mean(np.array(td_list)))
def train():
    game = CartPole()
    actions = game.legal_actions
    dqn = DQN(actions)
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    state = game.newGame()
    for episode in range(T):
        action = dqn.selectAction(state)
        actionNum = np.argmax(action)
        game.env.render()
        next_state, reward, game_over = game.next(actionNum)
        if game_over:
            dqn.storeExperience(state, action, 0, next_state, game_over)
            next_state = game.newGame()
        else:
            dqn.storeExperience(state, action, reward, next_state, game_over)

        ## TODO: sample a minibatch from the replay buffer
        state_batch = ???
        nextState_batch = ???
        action_batch = ???
        terminal_batch = ???
        reward_batch = ???

        y_batch = []
        Q_batch = sess.run(dqn.targetQNet.QValue,
                           feed_dict={dqn.targetQNet.stateInput: nextState_batch})
        for i in range(len(minibatch)):
            terminal = terminal_batch[i]
            if terminal:
                y_batch.append(reward_batch[i])
            else:
                ## TODO: add the target to the list of targets for each element
                ## in the minibatch using the Q update rule
                y_batch.append(???)

        currentQ_batch = sess.run(dqn.currentQNet.QValue,
                                  feed_dict={dqn.currentQNet.stateInput: state_batch})
        sess.run(dqn.trainStep,
                 feed_dict={dqn.yInput: y_batch,
                            dqn.actionInput: action_batch,
                            dqn.currentQNet.stateInput: state_batch})
        state = next_state
        if episode % UPDATE_TIME == 0:
            sess.run(dqn.copyCurrentToTargetOperation())
        if episode % 25000 == 0:
            saver.save(sess, 'networks/' + 'dqn', global_step=episode)
        if dqn.epsilon > FINAL_EPSILON:
            ## TODO: decay epsilon, which represents the probability of taking a random action
            dqn.epsilon -= ???
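# A hedged sketch of what the TODOs above ask for, written as standalone helpers.
# The replay-buffer layout (state, action, reward, next_state, terminal) and the
# constants GAMMA, INITIAL_EPSILON, FINAL_EPSILON and EXPLORE_STEPS are assumptions
# for illustration only; the actual DQN class may expose a different interface.
import random

GAMMA = 0.99                  # discount factor (assumed)
INITIAL_EPSILON = 1.0         # assumed starting exploration rate
FINAL_EPSILON = 0.05          # assumed final exploration rate (may already be defined elsewhere)
EXPLORE_STEPS = 100000        # assumed number of steps over which to anneal epsilon


def sample_minibatch(replay_memory, batch_size):
    # uniformly sample transitions and split them into per-field batches
    minibatch = random.sample(replay_memory, batch_size)
    state_batch, action_batch, reward_batch, nextState_batch, terminal_batch = \
        map(list, zip(*minibatch))
    return state_batch, action_batch, reward_batch, nextState_batch, terminal_batch


def q_learning_target(reward, terminal, next_q_values):
    # r for terminal transitions, r + gamma * max_a' Q_target(s', a') otherwise
    return reward if terminal else reward + GAMMA * np.max(next_q_values)


def decayed_epsilon(epsilon):
    # linearly decay the exploration probability towards FINAL_EPSILON
    return max(FINAL_EPSILON, epsilon - (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE_STEPS)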
def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings * (tiles_per_tiling**len(s))))
    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order**len(s)))

    for x in range(epoch):
        s = cartpole.d_zero()
        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        first_q = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, eps)
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)
            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]
            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            #            pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            a = new_a
            count += 1
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)
    return estimated_rewards
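# Example call of sarsa_cartpole with the Fourier basis; the learning rate,
# basis order, epsilon and episode count below are illustrative assumptions.
rewards = sarsa_cartpole(lr=0.001, baseparams={'order': 3}, epoch=100, eps=1e-2, base='fourier')
print('mean estimated reward over training:', np.mean(rewards))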
def td_cp(lrs, f_order):
    d = 4
    alpha_result = []
    cartpole = CartPole()
    print('cartpole ', f_order, ' td')
    # kth order Fourier Basis is defined as:
    for alpha in lrs:
        weight = np.zeros((1, (f_order + 1) ** d))
        # update weight in 100 loops
        print('alpha = ', alpha)
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
                s = new_s
                count += 1
            # print(weight)
        # calculate td in another 100 loops
        td_list = []
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
                s = new_s
                count += 1
            td_list.append(0)
        msv = np.mean(np.array(td_list))
        print('square td = ', msv)
        if np.isnan(msv):
            alpha_result.append(1e100)
        else:
            alpha_result.append(msv)
        print('##########################')
    return alpha_result
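# Example learning-rate sweep using td_cp above; the grid of step sizes and the
# Fourier-basis order are illustrative assumptions.
alphas = [1e-4, 1e-3, 1e-2]
msv_per_alpha = td_cp(alphas, f_order=3)
print('mean squared TD error per alpha:', list(zip(alphas, msv_per_alpha)))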
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None
    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))
    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams['tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a from s using a policy derived from q;
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams), actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]
            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)
            # Q-learning update: the target uses the greedy (max) value of s′
            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            #            pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw
            s = new_s
            count += 1
        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)
    return estimated_rewards
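# Example call of qlearning_cartpole; the epsilon-decay schedule below
# (1/(k+1), floored at 0.01) and the other hyper-parameters are assumptions
# used only for illustration.
decay = lambda k: max(0.01, 1.0 / (k + 1))
rewards = qlearning_cartpole(lr=0.001, baseparams={'order': 3},
                             decaylambda=decay, epoch=100, base='fourier')
print('mean estimated reward over training:', np.mean(rewards))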
from cartpole import CartPole

if __name__ == "__main__":
    cartpole = CartPole()
    cartpole.show()
        if self.iteration_count >= 200:
            terminal = True
        else:
            terminal = star.terminal(self.model.state)
        reward = star.reward(self.model.state, terminal)
        brain_state = star.state(self.model.state)
        return (brain_state, reward, terminal)


if __name__ == "__main__":
    config = bonsai_ai.Config(sys.argv)
    brain = bonsai_ai.Brain(config)
    model = CartPole()
    sim = CartpoleSimulator(brain, 'CartpoleSimulator', config)
    sim.model = model

    render = None
    if '--render' in sys.argv:
        log.info('rendering')
        from render import Viewer
        render = True
        viewer = Viewer()
        viewer.model = model

    log.info('starting simulation...')
    while sim.run():
        if render:
            viewer.update()
#!/usr/bin/env python
import random

from cartpole import CartPole

cp = CartPole()
print(cp)
for i in range(20):
    cp.step(random.choice([True, False]))
    print(cp)
Program: NFQ_EXAMPLE.PY
Date: Thursday, March 1 2012
Description: Test NFQ on my cartpole simulation.
"""

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from cartpole import CartPole
import numpy as np

module = ActionValueNetwork(4, 2)
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)
env = CartPole()

cnt = 0
for i in range(1000):
    env.reset()
    print "Episode: %d, Count: %d" % (i, cnt)
    cnt = 0
    while not env.failure():
        agent.integrateObservation(env.observation())
        action = agent.getAction()
        pstate, paction, reward, state = env.move(action)
        cnt += 1
        agent.giveReward(reward)
    agent.learn(1)
class CartPoleTraining:
    """ Training cartpole using the cross-entropy algorithm, based on the code
    from the book 'Deep Reinforcement Learning Hands-On' """

    Episode = namedtuple('Episode', field_names=['reward', 'steps'])
    EpisodeStep = namedtuple('EpisodeStep', field_names=['observation', 'action'])

    def __init__(self) -> None:
        self.cartpole = CartPole()

    def iterate_batches(self, net, batch_size):
        batch = []
        episode_reward = 0.0
        episode_steps = []
        # start the episode
        self.cartpole.episode_start()
        state = self.cartpole.get_state()
        obs = self.cartpole.state_to_gym(state)
        sm = nn.Softmax(dim=1)
        while True:
            obs_v = torch.FloatTensor([obs])
            act_probs_v = sm(net(obs_v))
            act_probs = act_probs_v.data.numpy()[0]
            action = np.random.choice(len(act_probs), p=act_probs)
            bonsai_action = self.cartpole.gym_to_action(action)
            self.cartpole.episode_step(bonsai_action)
            is_done = self.cartpole.halted()
            reward = self.cartpole.get_last_reward()
            next_obs = self.cartpole.state_to_gym(self.cartpole.get_state())
            episode_reward += reward
            step = self.EpisodeStep(observation=obs, action=action)
            episode_steps.append(step)
            if is_done:
                e = self.Episode(reward=episode_reward, steps=episode_steps)
                batch.append(e)
                episode_reward = 0.0
                episode_steps = []
                self.cartpole.episode_finish("")
                self.cartpole.episode_start()
                state = self.cartpole.get_state()
                next_obs = self.cartpole.state_to_gym(state)
                if len(batch) == batch_size:
                    yield batch
                    batch = []
            obs = next_obs

    def filter_batch(self, batch, percentile):
        rewards = list(map(lambda s: s.reward, batch))
        reward_bound = np.percentile(rewards, percentile)
        reward_mean = float(np.mean(rewards))
        train_obs = []
        train_act = []
        for reward, steps in batch:
            if reward < reward_bound:
                continue
            train_obs.extend(map(lambda step: step.observation, steps))
            train_act.extend(map(lambda step: step.action, steps))
        train_obs_v = torch.FloatTensor(train_obs)
        train_act_v = torch.LongTensor(train_act)
        return train_obs_v, train_act_v, reward_bound, reward_mean

    def train(self):
        obs_size = self.cartpole._env.unwrapped.observation_space.shape[0]
        n_actions = self.cartpole._env.unwrapped.action_space.n
        net = Net(obs_size, HIDDEN_SIZE, n_actions)
        objective = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=net.parameters(), lr=0.01)
        writer = SummaryWriter(comment="-cartpole")

        for iter_no, batch in enumerate(self.iterate_batches(net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = self.filter_batch(batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()
            # env.render()
            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" % (
                iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 199:
                print("Solved!")
                break
        writer.close()
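# The Net class and the constants HIDDEN_SIZE, BATCH_SIZE and PERCENTILE used
# above are not part of this excerpt. A minimal sketch consistent with the
# cross-entropy example in 'Deep Reinforcement Learning Hands-On' could look
# like this; the exact values are assumptions.
import torch.nn as nn

HIDDEN_SIZE = 128
BATCH_SIZE = 16
PERCENTILE = 70


class Net(nn.Module):
    def __init__(self, obs_size, hidden_size, n_actions):
        super(Net, self).__init__()
        # two-layer MLP that outputs unnormalised action scores (logits)
        self.net = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        )

    def forward(self, x):
        return self.net(x)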
if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("cartpole")
    log.setLevel(level='INFO')

    cross_entropy_agent = CartPoleTraining()
    cross_entropy_agent.train()
    # TODO: save the model after training and load it in the agent
    def __init__(self, cartpole: CartPole):
        self.cartpole = cartpole

    def act(self, state):
        return self.cartpole.gym_to_action(self.cartpole._env.action_space.sample())


if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("cartpole")
    log.setLevel(level='INFO')

    writer = SummaryWriter()

    # we will use our environment (wrapper of the OpenAI env)
    cartpole = CartPole()

    # specify which agent you want to use:
    # BonsaiAgent, which uses the trained Brain, or
    # RandomAgent, which randomly selects the next action
    agent = BonsaiAgent()

    episode_count = 100

    try:
        for i in range(episode_count):
            # start a new episode and get the new state
            cartpole.episode_start()
            state = cartpole.get_state()
            cum_reward = 0
from alpha_agent import AlphaZero
from cartpole import CartPole

# create an env_creator function
env_creator = lambda: CartPole()

# define the config with the hyper-parameters
config = {
    'buffer_size': 1000,
    'batch_size': 256,
    'lr': 1e-3,
    'gamma': 0.997,
    'n_steps': 10,
    'num_epochs': 100,
    'num_episodes_per_epoch': 5,
    'learning_starts': 500,  # number of timesteps to sample before SGD
    'value_loss_coefficient': 0.2,
    'model_config': {
        'value_support_min_val': 0,
        'value_support_max_val': 30,
        'num_hidden': 32,
    },
    'mcts_config': {
        'num_simulations': 20,
        "temperature": 1.0,
        "c1_coefficient": 1.25,
        "c2_coefficient": 19652,
        'add_dirichlet_noise': True,
        'dir_noise': 0.5,
        'dir_epsilon': 0.2,
    }
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""

from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# a simple environment
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print learner.V

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)

import pdb

env = CartPole()
# learnerq = SarsaCmac(2, 0.01, 0.95, 0.9, 0.01)
# learnerq = Sarsa(2, 170, 0.001, 0.95, 0.5, 0.01)
# learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9)  # From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(2, 0.5, 1.0, 0.95, 0.8, 0.9)  # Clearly does some learning, but not nearly as well. Policy not as stable.
learnerq.learn(1000, env)