Example #1
File: nfq.py Project: wsgan001/AI
from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.explorers import BoltzmannExplorer

from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt

# switch this to True if you want to see the cart balancing the pole (slower)
render = False

plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

# value network: 4 state features (the CartPoleEnvironment sensors) -> 3 discrete actions
module = ActionValueNetwork(4, 3)

# balance the pole for at most 100 steps per episode
task = DiscreteBalanceTask(env, 100)
learner = NFQ()
# raise the exploration rate of NFQ's default epsilon-greedy explorer
learner.explorer.epsilon = 0.4

agent = LearningAgent(module, learner)
# second agent sharing the same network but without a learner, for greedy evaluation
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)
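The snippet ends right after the experiment is constructed. Below is a minimal sketch of how such a setup is typically driven with PyBrain; it is not taken from the original file, and the episode count of 100 is an arbitrary choice.

# Hedged sketch: alternate between collecting episodes and NFQ updates.
for episode in range(100):
    experiment.doEpisodes(1)   # collect one episode with the exploring agent
    agent.learn()              # run an NFQ update on the gathered transitions
    agent.reset()              # clear the agent's episode history before the next rollout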
Example #2
# Imports inferred from the usage below; the original file's header is not shown here.
import pickle

import lasagne
from numpy import ones, concatenate, mean
from numpy.random import uniform
from pybrain.rl.environments.cartpole import CartPoleEnvironment, BalanceTask


def main():
    # Project-specific helpers (theano_form, sample_parameter, one_iteration,
    # one_sim_iteration, chunks, train, reward_prediction, l_action_formed,
    # SimBalanceTask) and constants (N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES,
    # N_OUTPUT_FEATURES, LEARNING_RATE) are defined elsewhere in the original file.

    # create the real cart-pole environment and a 200-step balancing task
    env = CartPoleEnvironment()
    task = BalanceTask(env, 200, desiredValue=None)

    # simulated task driven by the learned reward/observation prediction model
    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)

    # trainable parameters of the action network
    all_params = lasagne.layers.get_all_params(l_action_formed)

    records = []
    real_world_sample_counts = []
    for time in xrange(50):  # repeat the whole experiment 50 times
        records.append([])
        # re-initialise the 4 policy parameters uniformly in [-0.1, 0.1]
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))


        baseline = None
        num_parameters = 4  # four policy parameters
        init_sigma = 3  # initial exploration standard deviation
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]  # current policy parameter vector
        arg_reward = []

        # start with a high critic cost so the first rollouts use the real environment
        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0
        # critic-cost threshold below which the learned model is trusted for rollouts
        cost_confidence = 2

        for n in xrange(1500):
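            # Each iteration: draw a symmetric perturbation of the policy parameters,
            # evaluate current + epsilon and current - epsilon (in the learned model
            # when its prediction error is low, otherwise in the real environment),
            # then apply an SPSA/PGPE-style update to the parameters and sigmas.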

            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            if previous_cost <= cost_confidence:
                # The critic is accurate enough: roll out both perturbed policies
                # in the learned model instead of the real environment.
                rewards1, actions1, observations1, last_obs1, reward1 = one_sim_iteration(sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = one_sim_iteration(sim_task, all_params=current - epsilon)
                # allow at most two consecutive simulated iterations before
                # forcing a return to the real environment
                thinking_count += 1
                if thinking_count == 2:
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # Roll out both perturbed policies in the real environment
                # and keep track of the best reward seen so far.
                rewards1, actions1, observations1, last_obs1, reward1 = one_iteration(task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = one_iteration(task=task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2


                # Prepare critic training data from the first rollout: inputs are
                # (action, observation) pairs, targets are (reward, next observation).
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1::], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate([theano_form(rewards1, shape=(len(rewards1), 1)), predicted_obs1], axis=1)

                # Split the rollout into fixed-length chunks for critic training
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))


                # Prepare critic training data from the second rollout
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1::], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate([theano_form(rewards2, shape=(len(rewards2), 1)), predicted_obs2], axis=1)

                # Split the rollout into fixed-length chunks for critic training
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))



                # Annealed target for the critic cost: starts high, decreases with n,
                # and never drops below cost_confidence
                train_base_line = (700 - n*6)/2 if (700 - n*6)/2 > cost_confidence else cost_confidence

                # Train the critic on data from the first rollout until its mean cost
                # drops below the annealed baseline (the count1 > 1 guard caps this
                # at two passes per iteration).
                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for inp, outp in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(inp, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(outp, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    else:
                        if not count1 % 50:
                            print mean(costs1)
                        #print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break


                # Same critic training schedule for the second rollout.
                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for inp, outp in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(inp, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(outp, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))
                    if mean(costs2) < train_base_line:
                        break
                    else:
                        if not count2 % 50:
                            print mean(costs2)
                        #print "mean cost 2: ", mean(costs2), "baseline :", train_base_line
                    if count2 > 1:
                        break

                # Combined critic cost; if it is low enough, the next iterations
                # can use simulated rollouts instead of real ones.
                previous_cost = sum(costs1) + sum(costs2)


            mreward = (reward1 + reward2) / 2.

            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # estimate the gradients
                if reward1 != reward2:
                    # gradient estimate in the style of SPSA, but with a likelihood
                    # gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with moving-average baseline
                norm = (best_reward - baseline)
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / (best_reward - baseline)
                else:
                    fakt2 = 0.0
            # update the moving-average baseline
            baseline = 0.9 * baseline + 0.1 * mreward
            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon

            if fakt2 > 0:  # the sigma adaptation follows only positive gradients
                # apply the sigma update element-wise
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas


            # Evaluate the updated parameters on the real task with a fresh perturbation
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2) / 2.0
            arg_reward.append(test_mreward)

            print n


            # Every 10 iterations, compare real and simulated rewards and record progress
            if not n % 10:
                print "test reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(task=sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(task=sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2

                print "previous cost:", previous_cost
                print "real-world samples:", real_world_sample_count
                # average test reward since the last report
                temp_arg = sum(arg_reward) / len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []
        real_world_sample_counts.append(real_world_sample_count)

    #print records
    # persist the learning records and real-world sample counts for later analysis
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))
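The helpers used in Example #2 are not shown on this page. Below is a hypothetical sketch of two of them, chunks and sample_parameter, reconstructed only from how they are called above; the original implementations may differ.

from numpy.random import normal

def sample_parameter(sigmas):
    # Draw a zero-mean Gaussian perturbation with per-parameter std `sigmas`,
    # as used for the symmetric (current + epsilon / current - epsilon) rollouts.
    # epsilon_star is unused in the excerpt above; it is returned here only to
    # match the two-value call signature and may have a different meaning.
    epsilon = normal(0., sigmas)
    return epsilon, -epsilon

def chunks(data, n):
    # Yield successive length-n slices of `data`; Example #2 uses this to split
    # a rollout into critic training batches of N_CTIME_STEPS steps.
    for i in xrange(0, len(data), n):
        yield data[i:i + n]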