Example #1
def cartpole_sampling(theta, cm, K, Ke, N, epsilon):
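    # Cross-entropy method update: sample K parameter vectors from N(theta, cm),
    # score each one by its average return over N episodes, then refit the Gaussian
    # (mean and epsilon-regularized covariance) to the top Ke elites.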
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            cartpole = CartPole()
            cartpole.pi_params = theta_list[x].reshape(4, 2)
            epi = CartPoleEpisode(cartpole)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))

    # print(sorted(result_list, key=lambda n: n[-1], reverse=True))
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    # print(elite_list)
    theta_final = np.zeros(8)
    cm_final = epsilon * np.identity(8)
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    # print(cm_final)
    J_final /= Ke
    return theta_final, cm_final, J_final
Example #2
def evaluate(index):
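    # Restore the DQN weights from the checkpoint at `index` and run 200 greedy
    # (epsilon = 0) evaluation episodes of at most 400 steps, rendering only the last one.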
    game = CartPole()
    actions = game.legal_actions
    dqn = DQN(actions)
    dqn.epsilon = 0
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("networks")
    if checkpoint:
        saver.restore(sess, checkpoint.all_model_checkpoint_paths[index])
        print "Loaded: %s" % checkpoint.all_model_checkpoint_paths[index]
    rewards = []
    for episode in range(200):
        state = game.newGame()
        totReward = 0
        for _ in range(400):
            if episode == 199:
                game.env.render()
            action = dqn.selectAction(state)
            actionNum = np.argmax(action)
            next_state, reward, game_over = game.next(actionNum)
            totReward += reward
            state = next_state
            if game_over:
                break
        rewards.append(totReward)
    print(rewards)
    print("Average %s, best %s" % (sum(rewards) / len(rewards), max(rewards)))
def cartpole_evaluate(table, N):
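    # Return the policy's average episodic reward over N fresh CartPole runs.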
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
def multi_cartpole_episode(table, l):
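    # Worker for parallel evaluation: run one episode per element of l and push
    # each episode's return onto the shared queue cp_q.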
    for i in l:
        cartpole = CartPole()
        # print(i)
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        cp_q.put(epi.run_all_steps())
    return 0
def cartpole_evaluate(t, N):
    reward_l = []
    for i in range(N):
        cartpole = CartPole()
        # print(i)
        cartpole.pi_params = t.reshape(4, 2)
        epi = CartPoleEpisode(cartpole)
        reward_l.append(epi.run_all_steps())

    return sum(reward_l) / N
def run_expt(alpha_v, alpha_pi, gamma, lmbda):
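	# One-step actor-critic on CartPole: separate SGD optimizers for critic and actor,
	# bootstrapped TD target r_t + gamma * v(s'), episodes capped at max_steps steps.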
	# number of episodes and the per-episode step cap
	num_episodes = 1000
	max_steps = 1000
	
	steps_per_episode = np.zeros((num_episodes, ))
	avg_steps = 0.0

	sess = tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))

	optimizer_critic = tf.train.GradientDescentOptimizer(learning_rate=alpha_v)
	optimizer_actor = tf.train.GradientDescentOptimizer(learning_rate=alpha_pi)

	ac_agent = actorCritic(sess, optimizer_critic, optimizer_actor, critic_network, actor_network, gamma * lmbda, state_dim, num_actions)
	cartpole_domain = CartPole()
	for current_episode in range(num_episodes):
		current_state = cartpole_domain.reset()
		rescaled_current_state = rescale_states(current_state[0], current_state[1], current_state[2], current_state[3])

		update_target = np.zeros(1)
		G = 0.0
		step = 0

		while current_state is not None and step < max_steps:
			a_t = ac_agent.sampleAction(np.array(rescaled_current_state).reshape(1, state_dim))
			r_t, next_state = cartpole_domain.move(a_t)

			G += (gamma * r_t)
			step += 1

			# v_current = ac_agent.predictValue(np.array(rescaled_current_state).reshape(1, state_dim))
			v_next = np.zeros(1)
			rescaled_next_state = None
			if next_state is not None:
				rescaled_next_state = rescale_states(next_state[0], next_state[1], next_state[2], next_state[3])
				v_next = ac_agent.predictValue(np.array(rescaled_next_state).reshape(1, state_dim))
				# print("v_next: {}".format(v_next))
			update_target = r_t + (gamma * v_next)
			# print("update_target: {}".format(update_target))
			# print ("update_target: {}".format(update_target))
			# delta = r_t + (gamma * v_next) - v_current
			# print ("delta_prime: {}".format(delta_prime))
			ac_agent.updateModel(np.array(rescaled_current_state).reshape(1, state_dim), np.array([a_t]), np.array(update_target))
			rescaled_current_state = np.copy(rescaled_next_state)
			current_state = next_state

		steps_per_episode[current_episode] = step
		avg_steps = avg_steps + step

	avg_steps = avg_steps * 1.0 / num_episodes
	
	sess.close()
	tf.reset_default_graph()
	
	return (avg_steps, steps_per_episode)
Example #7
def td_cp_single(f_order, alpha):
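    # TD(0) policy evaluation with an order-f_order Fourier basis: 100 episodes of
    # weight updates, then 100 more episodes to accumulate the squared TD errors.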
    d = 4

    cartpole = CartPole()

    print('cartpole ', f_order, ' td')

    weight = np.zeros((1, (f_order + 1) ** d))
    # update weight in 100 loops
    print('alpha = ', alpha)
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
            s = new_s
            print(weight)
            count += 1
    # calculate td in another 100 loops
    td_list = []
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
            s = new_s
            count += 1
        td_list.append(0)
    print('square td = ', np.mean(np.array(td_list)))
Example #8
def train():
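    # DQN training loop: epsilon-greedy acting, experience replay, a target network
    # synced every UPDATE_TIME steps, and checkpoints saved every 25000 steps.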
    game = CartPole()
    actions = game.legal_actions
    dqn = DQN(actions)
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    state = game.newGame()
    for episode in range(T):
        action = dqn.selectAction(state)
        actionNum = np.argmax(action)
        game.env.render()
        next_state, reward, game_over = game.next(actionNum)
        if game_over:
            dqn.storeExperience(state, action, 0, next_state, game_over)
            next_state = game.newGame()
        else:
            dqn.storeExperience(state, action, reward, next_state, game_over)

        ## TODO: sample a minibatch from the replay buffer
        ## (filled in as a sketch -- assumes dqn stores (state, action, reward,
        ##  next_state, terminal) tuples in dqn.replayMemory, and that BATCH_SIZE
        ##  is defined and `random` is imported)
        minibatch = random.sample(dqn.replayMemory, BATCH_SIZE)
        state_batch = [d[0] for d in minibatch]
        action_batch = [d[1] for d in minibatch]
        reward_batch = [d[2] for d in minibatch]
        nextState_batch = [d[3] for d in minibatch]
        terminal_batch = [d[4] for d in minibatch]

        y_batch = []
        Q_batch = sess.run(dqn.targetQNet.QValue, feed_dict = {dqn.targetQNet.stateInput: nextState_batch} )
        for i in range(len(minibatch)):
            terminal = terminal_batch[i]
            if terminal:
                y_batch.append(reward_batch[i])
            else:
                ## TODO: add the target to the list of targets for each element in the minibatch using the Q update rule
                ## (filled in as a sketch -- GAMMA is an assumed discount-factor constant)
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_batch[i]))
        currentQ_batch = sess.run(dqn.currentQNet.QValue,
                                  feed_dict = {dqn.currentQNet.stateInput: state_batch })

        sess.run(dqn.trainStep, feed_dict = {dqn.yInput: y_batch, dqn.actionInput: action_batch, dqn.currentQNet.stateInput: state_batch})
        state = next_state

        if episode % UPDATE_TIME == 0:
            sess.run(dqn.copyCurrentToTargetOperation())

        if episode % 25000 == 0:
            saver.save(sess, 'networks/' + 'dqn', global_step= episode)
        if dqn.epsilon > FINAL_EPSILON:
            ## TODO: decay epsilon which represents the probability of taking a random action
            ## (filled in as a sketch -- linear decay; INITIAL_EPSILON and EXPLORE are assumed constants)
            dqn.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
Example #9
def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
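    # Episodic SARSA with linear function approximation; the feature construction
    # (Fourier, tile coding, or RBF) is selected by `base` and sized by `baseparams`.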
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros(
            (1, len(actions) * num_tilings * (tiles_per_tiling**len(s))))

    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order**len(s)))

    for x in range(epoch):
        s = cartpole.d_zero()

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        first_q = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                    actions, eps)
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(
                pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards
Example #10
def td_cp(lrs, f_order):
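    # For each learning rate in lrs: train TD(0) weights on a Fourier basis for 100
    # episodes, measure the mean squared TD error over another 100, and record it
    # (1e100 if training diverged to NaN).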
    d = 4

    alpha_result = []
    cartpole = CartPole()

    print('cartpole ', f_order, ' td')

    # kth order Fourier Basis is defined as:
    for alpha in lrs:
        weight = np.zeros((1, (f_order + 1) ** d))
        # update weight in 100 loops
        print('alpha = ', alpha)
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
                s = new_s
                count += 1
        # print(weight)

        # calculate td in another 100 loops
        td_list = []
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
                s = new_s
                count += 1
            td_list.append(0)

        msv = np.mean(np.array(td_list))
        print('square td = ', msv)
        if np.isnan(msv):
            alpha_result.append(1e100)
        else:
            alpha_result.append(msv)

    print('##########################')
    return alpha_result
Example #11
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
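    # Episodic Q-learning with linear function approximation; the exploration rate
    # for episode x is given by decaylambda(x).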
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = cartpole.d_zero()

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                        actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards
Example #12
0
from cartpole import CartPole

if __name__ == "__main__":
    cartpole = CartPole()
    cartpole.show()
Example #13
        if self.iteration_count >= 200:
            terminal = True
        else:
            terminal = star.terminal(self.model.state)

        reward = star.reward(self.model.state, terminal)
        brain_state = star.state(self.model.state)

        return (brain_state, reward, terminal)


if __name__ == "__main__":
    config = bonsai_ai.Config(sys.argv)
    brain = bonsai_ai.Brain(config)

    model = CartPole()
    sim = CartpoleSimulator(brain, 'CartpoleSimulator', config)
    sim.model = model

    render = None
    if '--render' in sys.argv:
        log.info('rendering')
        from render import Viewer
        render = True
        viewer = Viewer()
        viewer.model = model

    log.info('starting simulation...')
    while sim.run():
        if render:
            viewer.update()
Example #14
#!/usr/bin/env python

import random
from cartpole import CartPole
cp = CartPole()
print(cp)

for i in range(20):
    cp.step(random.choice([True, False]))
    print(cp)
Example #15
"""
Program: NFQ_EXAMPLE.PY
Date: Thursday, March  1 2012
Description: Test NFQ on my cartpole simulation.
"""

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from cartpole import CartPole
import numpy as np

module = ActionValueNetwork(4,2)
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)

env = CartPole()
cnt = 0
for i in range(1000):
    
    env.reset()
    print "Episode: %d, Count: %d" % (i,cnt)
    cnt = 0
    while not env.failure():
        agent.integrateObservation(env.observation())
        action = agent.getAction()
        pstate, paction, reward, state = env.move(action)
        cnt += 1
        agent.giveReward(reward)
    agent.learn(1)

Example #16
    def __init__(self) -> None:
        self.cartpole = CartPole()
Example #17
class CartPoleTraining:
    """ Training cartpole using cross entropy agoritham based on the code from the book
    'Deep Reinforcement Learning Hands-On'
    """
    Episode = namedtuple('Episode', field_names=['reward', 'steps'])
    EpisodeStep = namedtuple('EpisodeStep',
                             field_names=['observation', 'action'])

    def __init__(self) -> None:
        self.cartpole = CartPole()

    def iterate_batches(self, net, batch_size):
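        # Generator: roll out episodes with the softmax policy induced by `net` and
        # yield them in lists of `batch_size` Episode records.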
        batch = []
        episode_reward = 0.0
        episode_steps = []

        #start the episode
        self.cartpole.episode_start()
        state = self.cartpole.get_state()

        obs = self.cartpole.state_to_gym(state)

        sm = nn.Softmax(dim=1)
        while True:
            obs_v = torch.FloatTensor([obs])
            act_probs_v = sm(net(obs_v))
            act_probs = act_probs_v.data.numpy()[0]
            action = np.random.choice(len(act_probs), p=act_probs)

            bonsai_action = self.cartpole.gym_to_action(action)

            self.cartpole.episode_step(bonsai_action)

            is_done = self.cartpole.halted()
            reward = self.cartpole.get_last_reward()
            next_obs = self.cartpole.state_to_gym(self.cartpole.get_state())

            episode_reward += reward
            step = self.EpisodeStep(observation=obs, action=action)
            episode_steps.append(step)
            if is_done:
                e = self.Episode(reward=episode_reward, steps=episode_steps)
                batch.append(e)
                episode_reward = 0.0
                episode_steps = []
                self.cartpole.episode_finish("")

                self.cartpole.episode_start()
                state = self.cartpole.get_state()

                next_obs = self.cartpole.state_to_gym(state)

                if len(batch) == batch_size:
                    yield batch
                    batch = []
            obs = next_obs

    def filter_batch(self, batch, percentile):
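        # Keep only the elite episodes whose reward reaches the given percentile and
        # flatten their steps into observation/action training tensors.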
        rewards = list(map(lambda s: s.reward, batch))
        reward_bound = np.percentile(rewards, percentile)
        reward_mean = float(np.mean(rewards))

        train_obs = []
        train_act = []
        for reward, steps in batch:
            if reward < reward_bound:
                continue
            train_obs.extend(map(lambda step: step.observation, steps))
            train_act.extend(map(lambda step: step.action, steps))

        train_obs_v = torch.FloatTensor(train_obs)
        train_act_v = torch.LongTensor(train_act)
        return train_obs_v, train_act_v, reward_bound, reward_mean

    def train(self):
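        # Cross-entropy method loop: collect a batch of episodes, fit the network to
        # the elite episodes, log to TensorBoard, and stop once mean reward exceeds 199.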
        obs_size = self.cartpole._env.unwrapped.observation_space.shape[0]
        n_actions = self.cartpole._env.unwrapped.action_space.n

        net = Net(obs_size, HIDDEN_SIZE, n_actions)
        objective = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=net.parameters(), lr=0.01)
        writer = SummaryWriter(comment="-cartpole")

        for iter_no, batch in enumerate(self.iterate_batches(net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = self.filter_batch(
                batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            #env.render()

            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" %
                  (iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 199:
                print("Solved!")
                break
        writer.close()
Example #18
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            #env.render()

            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" %
                  (iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 199:
                print("Solved!")
                break
        writer.close()


if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("cartpole")
    log.setLevel(level='INFO')

    cross_entropy_agent = CartPoleTraining()
    cross_entropy_agent.train()

    #TODO  save the model after training and load it in agent

    # we will use our environment (wrapper of OpenAI env)
    cartpole = CartPole()
Example #19
    def __init__(self, cartpole: CartPole):
        self.cartpole = cartpole

    def act(self, state):
        # sample a random legal action from this agent's wrapped gym environment
        return self.cartpole.gym_to_action(self.cartpole._env.action_space.sample())


if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("cartpole")
    log.setLevel(level='INFO')

    writer = SummaryWriter()
    # we will use our environment (wrapper of OpenAI env)
    cartpole = CartPole()

    # specify which agent you want to use,
    # BonsaiAgent that uses trained Brain or
    # RandomAgent that randomly selects next action
    agent = BonsaiAgent()

    episode_count = 100

    try:
        for i in range(episode_count):
            #start a new episode and get the new state
            cartpole.episode_start()
            state = cartpole.get_state()
            cum_reward = 0
Example #20
from alpha_agent import AlphaZero
from cartpole import CartPole

# create an env_creator function
env_creator = lambda: CartPole()

# define the config with the hyper-parameters
config = {
    'buffer_size': 1000,
    'batch_size': 256,
    'lr': 1e-3,
    'gamma': 0.997,
    'n_steps': 10,
    'num_epochs': 100,
    'num_episodes_per_epoch': 5,
    'learning_starts': 500,  # number of timesteps to sample before SGD
    'value_loss_coefficient': 0.2,
    'model_config': {
        'value_support_min_val': 0,
        'value_support_max_val': 30,
        'num_hidden': 32,
    },
    'mcts_config': {
        'num_simulations': 20,
        "temperature": 1.0,
        "c1_coefficient": 1.25,
        "c2_coefficient": 19652,
        'add_dirichlet_noise': True,
        'dir_noise': 0.5,
        'dir_epsilon': 0.2,
    }
}
Example #21
"""
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""

from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# a simple environment
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print(learner.V)

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)

import pdb

env = CartPole()
#learnerq = SarsaCmac(2,0.01,0.95,0.9,0.01)
#learnerq = Sarsa(2,170,0.001,0.95,0.5,0.01)
#learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9) # From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(
    2, 0.5, 1.0, 0.95, 0.8, 0.9
)  # Clearly does some learning, but not nearly as well. Policy not as stable.
learnerq.learn(1000, env)