Example #1
def cartpole_sampling(theta, cm, K, Ke, N, epsilon):
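    # Cross-entropy method update: sample K parameter vectors from N(theta, cm),
    # score each one by its average return over N episodes, then refit the Gaussian
    # (mean and epsilon-regularized covariance) to the top Ke elites.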
    theta_list = np.random.multivariate_normal(theta, cm, K)
    result_list = []
    for x in range(K):
        # concurrent_eval(theta_list, x, result_list, N)
        avg_reward = 0
        for i in range(N):
            cartpole = CartPole()
            cartpole.pi_params = theta_list[x].reshape(4, 2)
            epi = CartPoleEpisode(cartpole)
            avg_reward += epi.run_all_steps()
        result_list.append((theta_list[x], avg_reward / N))

    # print(sorted(result_list, key=lambda n: n[-1], reverse=True))
    elite_list = sorted(result_list, key=lambda n: n[-1], reverse=True)[:Ke]
    # print(elite_list)
    theta_final = np.zeros(8)
    cm_final = epsilon * np.identity(8)
    J_final = 0
    for t in elite_list:
        theta_final += t[0]
        cm_final += np.array([t[0] - theta]).T.dot(np.array([t[0] - theta]))
        J_final += t[1]
    theta_final /= Ke
    cm_final /= (epsilon + Ke)
    # print(cm_final)
    J_final /= Ke
    return theta_final, cm_final, J_final
Example #2
def evaluate(index):
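    # Restore the DQN weights from the checkpoint at `index` and run 200 greedy
    # (epsilon = 0) evaluation episodes of at most 400 steps, rendering only the last one.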
    game = CartPole()
    actions = game.legal_actions
    dqn = DQN(actions)
    dqn.epsilon = 0
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("networks")
    if checkpoint:
        saver.restore(sess, checkpoint.all_model_checkpoint_paths[index])
        print "Loaded: %s" % checkpoint.all_model_checkpoint_paths[index]
    rewards = []
    for episode in range(200):
        state = game.newGame()
        totReward = 0
        for _ in range(400):
            if episode == 199:
                game.env.render()
            action = dqn.selectAction(state)
            actionNum = np.argmax(action)
            next_state, reward, game_over = game.next(actionNum)
            totReward += reward
            state = next_state
            if game_over:
                break
        rewards.append(totReward)
    print(rewards)
    print("Average %s, best %s" % (sum(rewards) / len(rewards), max(rewards)))
def cartpole_evaluate(table, N):
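    # Return the policy's average episodic reward over N fresh CartPole runs.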
    avg_reward = 0
    for i in range(N):
        cartpole = CartPole()
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        avg_reward += epi.run_all_steps()
    return avg_reward / N
def multi_cartpole_episode(table, l):
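    # Worker for parallel evaluation: run one episode per element of l and push
    # each episode's return onto the shared queue cp_q.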
    for i in l:
        cartpole = CartPole()
        # print(i)
        cartpole.pi_params = table
        epi = CartPoleEpisode(cartpole)
        cp_q.put(epi.run_all_steps())
    return 0
def cartpole_evaluate(t, N):
    reward_l = []
    for i in range(N):
        cartpole = CartPole()
        # print(i)
        cartpole.pi_params = t.reshape(4, 2)
        epi = CartPoleEpisode(cartpole)
        reward_l.append(epi.run_all_steps())

    return sum(reward_l) / N
def run_expt(alpha_v, alpha_pi, gamma, lmbda):
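	# One-step actor-critic on CartPole: separate SGD optimizers for critic and actor,
	# bootstrapped TD target r_t + gamma * v(s'), episodes capped at max_steps steps.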
	# number of episodes and the per-episode step cap
	num_episodes = 1000
	max_steps = 1000
	
	steps_per_episode = np.zeros((num_episodes, ))
	avg_steps = 0.0

	sess = tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))

	optimizer_critic = tf.train.GradientDescentOptimizer(learning_rate=alpha_v)
	optimizer_actor = tf.train.GradientDescentOptimizer(learning_rate=alpha_pi)

	ac_agent = actorCritic(sess, optimizer_critic, optimizer_actor, critic_network, actor_network, gamma * lmbda, state_dim, num_actions)
	cartpole_domain = CartPole()
	for current_episode in range(num_episodes):
		current_state = cartpole_domain.reset()
		rescaled_current_state = rescale_states(current_state[0], current_state[1], current_state[2], current_state[3])

		update_target = np.zeros(1)
		G = 0.0
		step = 0

		while current_state is not None and step < max_steps:
			a_t = ac_agent.sampleAction(np.array(rescaled_current_state).reshape(1, state_dim))
			r_t, next_state = cartpole_domain.move(a_t)

			G += (gamma * r_t)
			step += 1

			# v_current = ac_agent.predictValue(np.array(rescaled_current_state).reshape(1, state_dim))
			v_next = np.zeros(1)
			rescaled_next_state = None
			if next_state is not None:
				rescaled_next_state = rescale_states(next_state[0], next_state[1], next_state[2], next_state[3])
				v_next = ac_agent.predictValue(np.array(rescaled_next_state).reshape(1, state_dim))
				# print("v_next: {}".format(v_next))
			update_target = r_t + (gamma * v_next)
			# print("update_target: {}".format(update_target))
			# print ("update_target: {}".format(update_target))
			# delta = r_t + (gamma * v_next) - v_current
			# print ("delta_prime: {}".format(delta_prime))
			ac_agent.updateModel(np.array(rescaled_current_state).reshape(1, state_dim), np.array([a_t]), np.array(update_target))
			rescaled_current_state = np.copy(rescaled_next_state)
			current_state = next_state

		steps_per_episode[current_episode] = step
		avg_steps = avg_steps + step

	avg_steps = avg_steps * 1.0 / num_episodes
	
	sess.close()
	tf.reset_default_graph()
	
	return (avg_steps, steps_per_episode)
Example #7
def td_cp_single(f_order, alpha):
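    # TD(0) policy evaluation with an order-f_order Fourier basis: 100 episodes of
    # weight updates, then 100 more episodes to accumulate the squared TD errors.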
    d = 4

    cartpole = CartPole()

    print('cartpole ', f_order, ' td')

    weight = np.zeros((1, (f_order + 1) ** d))
    # update weight in 100 loops
    print('alpha = ', alpha)
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
            s = new_s
            print(weight)
            count += 1
    # calculate td in another 100 loops
    td_list = []
    for x in range(100):
        s = cartpole.d_zero()
        count = 0
        while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
            a = cartpole.pi(s)
            new_s, r = cartpole.P_and_R(s, a)
            td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
            s = new_s
            count += 1
        td_list.append(0)
    print('square td = ', np.mean(np.array(td_list)))
Example #8
def train():
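    # DQN training loop: epsilon-greedy acting, experience replay, a target network
    # synced every UPDATE_TIME steps, and checkpoints saved every 25000 steps.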
    game = CartPole()
    actions = game.legal_actions
    dqn = DQN(actions)
    sess = tf.InteractiveSession()
    sess.run(tf.initialize_all_variables())
    saver = tf.train.Saver()
    state = game.newGame()
    for episode in range(T):
        action = dqn.selectAction(state)
        actionNum = np.argmax(action)
        game.env.render()
        next_state, reward, game_over = game.next(actionNum)
        if game_over:
            dqn.storeExperience(state, action, 0, next_state, game_over)
            next_state = game.newGame()
        else:
            dqn.storeExperience(state, action, reward, next_state, game_over)

        ## TODO: sample a minibatch from the replay buffer
        ## (filled in as a sketch -- assumes dqn stores (state, action, reward,
        ##  next_state, terminal) tuples in dqn.replayMemory, and that BATCH_SIZE
        ##  is defined and `random` is imported)
        minibatch = random.sample(dqn.replayMemory, BATCH_SIZE)
        state_batch = [d[0] for d in minibatch]
        action_batch = [d[1] for d in minibatch]
        reward_batch = [d[2] for d in minibatch]
        nextState_batch = [d[3] for d in minibatch]
        terminal_batch = [d[4] for d in minibatch]

        y_batch = []
        Q_batch = sess.run(dqn.targetQNet.QValue, feed_dict = {dqn.targetQNet.stateInput: nextState_batch} )
        for i in range(len(minibatch)):
            terminal = terminal_batch[i]
            if terminal:
                y_batch.append(reward_batch[i])
            else:
                ## TODO: add the target to the list of targets for each element in the minibatch using the Q update rule
                ## (filled in as a sketch -- GAMMA is an assumed discount-factor constant)
                y_batch.append(reward_batch[i] + GAMMA * np.max(Q_batch[i]))
        currentQ_batch = sess.run(dqn.currentQNet.QValue,
                                  feed_dict = {dqn.currentQNet.stateInput: state_batch })

        sess.run(dqn.trainStep, feed_dict = {dqn.yInput: y_batch, dqn.actionInput: action_batch, dqn.currentQNet.stateInput: state_batch})
        state = next_state

        if episode % UPDATE_TIME == 0:
            sess.run(dqn.copyCurrentToTargetOperation())

        if episode % 25000 == 0:
            saver.save(sess, 'networks/' + 'dqn', global_step= episode)
        if dqn.epsilon > FINAL_EPSILON:
            ## TODO: decay epsilon which represents the probability of taking a random action
            ## (filled in as a sketch -- linear decay; INITIAL_EPSILON and EXPLORE are assumed constants)
            dqn.epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE
Example #9
def sarsa_cartpole(lr, baseparams, epoch=100, eps=1e-2, base='fourier'):
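    # Episodic SARSA with linear function approximation; the feature construction
    # (Fourier, tile coding, or RBF) is selected by `base` and sized by `baseparams`.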
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros(
            (1, len(actions) * num_tilings * (tiles_per_tiling**len(s))))

    elif base == 'rbf':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * order**len(s)))

    for x in range(epoch):
        s = cartpole.d_zero()

        # choose a from s using a policy derived from q (e.g., ε-greedy or softmax);
        first_q = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                    actions, eps)
        # pi_s = pe.epsilon_greedy(pe.qw(w, s, order, actions, base), actions, eps)
        a = np.random.choice(actions, 1, p=first_q)[0]

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(
                pe.qw(w, new_s, actions, base, baseparams), actions, eps)
            new_a = np.random.choice(actions, 1, p=pi_temp)[0]

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = pe.qw_ele(w, new_s, new_a, actions, base, baseparams)[0]
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            a = new_a
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w(w, eps, base, baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards
Example #10
def td_cp(lrs, f_order):
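    # For each learning rate in lrs: train TD(0) weights on a Fourier basis for 100
    # episodes, measure the mean squared TD error over another 100, and record it
    # (1e100 if training diverged to NaN).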
    d = 4

    alpha_result = []
    cartpole = CartPole()

    print('cartpole ', f_order, ' td')

    # kth order Fourier Basis is defined as:
    for alpha in lrs:
        weight = np.zeros((1, (f_order + 1) ** d))
        # update weight in 100 loops
        print('alpha = ', alpha)
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                weight += alpha * (r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) * dvwdw(weight, s, f_order).T
                s = new_s
                count += 1
        # print(weight)

        # calculate td in another 100 loops
        td_list = []
        for x in range(100):
            s = cartpole.d_zero()
            count = 0
            while np.abs(s[0]) < cartpole.edge and np.abs(s[1]) < cartpole.fail_angle and count < 1010:
                a = cartpole.pi(s)
                new_s, r = cartpole.P_and_R(s, a)
                td_list.append((r + vw(weight, new_s, f_order) - vw(weight, s, f_order)) ** 2)
                s = new_s
                count += 1
            td_list.append(0)

        msv = np.mean(np.array(td_list))
        print('square td = ', msv)
        if np.isnan(msv):
            alpha_result.append(1e100)
        else:
            alpha_result.append(msv)

    print('##########################')
    return alpha_result
Example #11
def qlearning_cartpole(lr, baseparams, decaylambda, epoch=100, base='fourier'):
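    # Episodic Q-learning with linear function approximation; the exploration rate
    # for episode x is given by decaylambda(x).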
    cartpole = CartPole()
    estimated_rewards = np.zeros(epoch)
    actions = cartpole.actions
    w = None

    if base == 'fourier':
        order = baseparams['order']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * (order + 1)**len(s)))

    elif base == 'tile':
        num_tilings, tiles_per_tiling = baseparams['num_tilings'], baseparams[
            'tiles_per_tiling']
        s = cartpole.d_zero()
        w = np.zeros((1, len(actions) * num_tilings))

    for x in range(epoch):
        s = cartpole.d_zero()

        count = 0

        while np.abs(s[0]) < cartpole.edge and np.abs(
                s[1]) < cartpole.fail_angle and count < 1010:
            # Choose a′ from s′ using a policy derived from q;
            pi_temp = pe.epsilon_greedy(pe.qw(w, s, actions, base, baseparams),
                                        actions, decaylambda(x))
            a = np.random.choice(actions, 1, p=pi_temp)[0]

            # Take action a and observe r and s′;
            new_s, r = cartpole.P_and_R(s, a)

            # w += lr * (r + pe.qw_fourier_ele(w, new_s, new_a, order, actions) -
            # pe.qw_fourier_ele(w, s, a, order, actions)) * pe.dqwdw_fourier(s, a, order, actions)
            new_q = np.max(pe.qw(w, new_s, actions, base, baseparams))
            q, dqdw = pe.qw_ele(w, s, a, actions, base, baseparams)
            w += lr * (r + new_q - q) * dqdw

            s = new_s
            count += 1

        epi = CartPoleEpisode(cartpole)
        estimated_rewards[x] = epi.run_with_w_softmax(w, decaylambda(x), base,
                                                      baseparams)
        print('episode: ', x, ', reward: ', estimated_rewards[x])
        # print('episode: ', x, ', w: ', w)

    return estimated_rewards
Example #12
0
from cartpole import CartPole

if __name__ == "__main__":
    cartpole = CartPole()
    cartpole.show()
Example #13
        if self.iteration_count >= 200:
            terminal = True
        else:
            terminal = star.terminal(self.model.state)

        reward = star.reward(self.model.state, terminal)
        brain_state = star.state(self.model.state)

        return (brain_state, reward, terminal)


if __name__ == "__main__":
    config = bonsai_ai.Config(sys.argv)
    brain = bonsai_ai.Brain(config)

    model = CartPole()
    sim = CartpoleSimulator(brain, 'CartpoleSimulator', config)
    sim.model = model

    render = None
    if '--render' in sys.argv:
        log.info('rendering')
        from render import Viewer
        render = True
        viewer = Viewer()
        viewer.model = model

    log.info('starting simulation...')
    while sim.run():
        if render:
            viewer.update()
Example #14
#!/usr/bin/env python

import random
from cartpole import CartPole
cp = CartPole()
print(cp)

for i in range(20):
    cp.step(random.choice([True, False]))
    print(cp)
Example #15
"""
Program: NFQ_EXAMPLE.PY
Date: Thursday, March  1 2012
Description: Test NFQ on my cartpole simulation.
"""

from pybrain.rl.agents import LearningAgent
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from cartpole import CartPole
import numpy as np

module = ActionValueNetwork(4,2)
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)

env = CartPole()
cnt = 0
for i in range(1000):
    
    env.reset()
    print "Episode: %d, Count: %d" % (i,cnt)
    cnt = 0
    while not env.failure():
        agent.integrateObservation(env.observation())
        action = agent.getAction()
        pstate, paction, reward, state = env.move(action)
        cnt += 1
        agent.giveReward(reward)
    agent.learn(1)

Example #16
    def __init__(self) -> None:
        self.cartpole = CartPole()
Example #17
class CartPoleTraining:
    """ Training cartpole using cross entropy agoritham based on the code from the book
    'Deep Reinforcement Learning Hands-On'
    """
    Episode = namedtuple('Episode', field_names=['reward', 'steps'])
    EpisodeStep = namedtuple('EpisodeStep',
                             field_names=['observation', 'action'])

    def __init__(self) -> None:
        self.cartpole = CartPole()

    def iterate_batches(self, net, batch_size):
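        # Generator: roll out episodes with the softmax policy induced by `net` and
        # yield them in lists of `batch_size` Episode records.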
        batch = []
        episode_reward = 0.0
        episode_steps = []

        #start the episode
        self.cartpole.episode_start()
        state = self.cartpole.get_state()

        obs = self.cartpole.state_to_gym(state)

        sm = nn.Softmax(dim=1)
        while True:
            obs_v = torch.FloatTensor([obs])
            act_probs_v = sm(net(obs_v))
            act_probs = act_probs_v.data.numpy()[0]
            action = np.random.choice(len(act_probs), p=act_probs)

            bonsai_action = self.cartpole.gym_to_action(action)

            self.cartpole.episode_step(bonsai_action)

            is_done = self.cartpole.halted()
            reward = self.cartpole.get_last_reward()
            next_obs = self.cartpole.state_to_gym(self.cartpole.get_state())

            episode_reward += reward
            step = self.EpisodeStep(observation=obs, action=action)
            episode_steps.append(step)
            if is_done:
                e = self.Episode(reward=episode_reward, steps=episode_steps)
                batch.append(e)
                episode_reward = 0.0
                episode_steps = []
                self.cartpole.episode_finish("")

                self.cartpole.episode_start()
                state = self.cartpole.get_state()

                next_obs = self.cartpole.state_to_gym(state)

                if len(batch) == batch_size:
                    yield batch
                    batch = []
            obs = next_obs

    def filter_batch(self, batch, percentile):
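        # Keep only the elite episodes whose reward reaches the given percentile and
        # flatten their steps into observation/action training tensors.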
        rewards = list(map(lambda s: s.reward, batch))
        reward_bound = np.percentile(rewards, percentile)
        reward_mean = float(np.mean(rewards))

        train_obs = []
        train_act = []
        for reward, steps in batch:
            if reward < reward_bound:
                continue
            train_obs.extend(map(lambda step: step.observation, steps))
            train_act.extend(map(lambda step: step.action, steps))

        train_obs_v = torch.FloatTensor(train_obs)
        train_act_v = torch.LongTensor(train_act)
        return train_obs_v, train_act_v, reward_bound, reward_mean

    def train(self):
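        # Cross-entropy method loop: collect a batch of episodes, fit the network to
        # the elite episodes, log to TensorBoard, and stop once mean reward exceeds 199.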
        obs_size = self.cartpole._env.unwrapped.observation_space.shape[0]
        n_actions = self.cartpole._env.unwrapped.action_space.n

        net = Net(obs_size, HIDDEN_SIZE, n_actions)
        objective = nn.CrossEntropyLoss()
        optimizer = optim.Adam(params=net.parameters(), lr=0.01)
        writer = SummaryWriter(comment="-cartpole")

        for iter_no, batch in enumerate(self.iterate_batches(net, BATCH_SIZE)):
            obs_v, acts_v, reward_b, reward_m = self.filter_batch(
                batch, PERCENTILE)
            optimizer.zero_grad()
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            #env.render()

            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" %
                  (iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 199:
                print("Solved!")
                break
        writer.close()
Example #18
            action_scores_v = net(obs_v)
            loss_v = objective(action_scores_v, acts_v)
            loss_v.backward()
            optimizer.step()

            #env.render()

            print("%d: loss=%.3f, reward_mean=%.1f, rw_bound=%.1f" %
                  (iter_no, loss_v.item(), reward_m, reward_b))
            writer.add_scalar("loss", loss_v.item(), iter_no)
            writer.add_scalar("reward_bound", reward_b, iter_no)
            writer.add_scalar("reward_mean", reward_m, iter_no)
            if reward_m > 199:
                print("Solved!")
                break
        writer.close()


if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("cartpole")
    log.setLevel(level='INFO')

    cross_entropy_agent = CartPoleTraining()
    cross_entropy_agent.train()

    #TODO  save the model after training and load it in agent

    # we will use our environment (wrapper of OpenAI env)
    cartpole = CartPole()
Example #19
    def __init__(self, cartpole: CartPole):
        self.cartpole = cartpole

    def act(self, state):
        # sample a random legal action from this agent's wrapped gym environment
        return self.cartpole.gym_to_action(self.cartpole._env.action_space.sample())


if __name__ == '__main__':
    logging.basicConfig()
    log = logging.getLogger("cartpole")
    log.setLevel(level='INFO')

    writer = SummaryWriter()
    # we will use our environment (wrapper of OpenAI env)
    cartpole = CartPole()

    # specify which agent you want to use,
    # BonsaiAgent that uses trained Brain or
    # RandomAgent that randomly selects next action
    agent = BonsaiAgent()

    episode_count = 100

    try:
        for i in range(episode_count):
            #start a new episode and get the new state
            cartpole.episode_start()
            state = cartpole.get_state()
            cum_reward = 0
Example #20
from alpha_agent import AlphaZero
from cartpole import CartPole

# create an env_creator function
env_creator = lambda: CartPole()

# define the config with the hyper-parameters
config = {
    'buffer_size': 1000,
    'batch_size': 256,
    'lr': 1e-3,
    'gamma': 0.997,
    'n_steps': 10,
    'num_epochs': 100,
    'num_episodes_per_epoch': 5,
    'learning_starts': 500,  # number of timesteps to sample before SGD
    'value_loss_coefficient': 0.2,
    'model_config': {
        'value_support_min_val': 0,
        'value_support_max_val': 30,
        'num_hidden': 32,
    },
    'mcts_config': {
        'num_simulations': 20,
        "temperature": 1.0,
        "c1_coefficient": 1.25,
        "c2_coefficient": 19652,
        'add_dirichlet_noise': True,
        'dir_noise': 0.5,
        'dir_epsilon': 0.2,
    }
}
Example #21
"""
Author: Jeremy M. Stober
Program: TD_EXAMPLE.PY
Date: Friday, February 24 2012
Description: Examples using TD algorithms to learn value functions.
"""

from gridworld.boyan import Boyan
from gridworld.chainwalk import Chainwalk
from cartpole import CartPole
from td import TD, TDQ, TDQCmac, SarsaCmac, Sarsa, ActorCritic, ActorCriticCmac

# a simple environment
env = Boyan()
learner = TD(13, 0.1, 1.0, 0.8)
learner.learn(1000, env, env.random_policy)
print(learner.V)

env = Chainwalk()
learnerq = TDQ(2, 4, 0.1, 0.9, 0.8)

import pdb

env = CartPole()
#learnerq = SarsaCmac(2,0.01,0.95,0.9,0.01)
#learnerq = Sarsa(2,170,0.001,0.95,0.5,0.01)
#learnerq = ActorCritic(2, 162, 0.5, 0.5, 0.95, 0.8, 0.9) # From an old Sutton paper -- seems to work quite well.
learnerq = ActorCriticCmac(
    2, 0.5, 1.0, 0.95, 0.8, 0.9
)  # Clearly does some learning, but not nearly as well. Policy not as stable.
learnerq.learn(1000, env)