from pybrain.rl.environments.cartpole import CartPoleEnvironment, DiscreteBalanceTask, CartPoleRenderer
from pybrain.rl.agents import LearningAgent
from pybrain.rl.experiments import EpisodicExperiment
from pybrain.rl.learners.valuebased import NFQ, ActionValueNetwork
from pybrain.rl.explorers import BoltzmannExplorer
from numpy import array, arange, meshgrid, pi, zeros, mean
from matplotlib import pyplot as plt

# switch this to True if you want to see the cart balancing the pole (slower)
render = False

plt.ion()

env = CartPoleEnvironment()
if render:
    renderer = CartPoleRenderer()
    env.setRenderer(renderer)
    renderer.start()

module = ActionValueNetwork(4, 3)
task = DiscreteBalanceTask(env, 100)
learner = NFQ()
learner.explorer.epsilon = 0.4
agent = LearningAgent(module, learner)
testagent = LearningAgent(module, None)
experiment = EpisodicExperiment(task, agent)
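# --- Assumed helper definitions ------------------------------------------
# main() below relies on several project-specific helpers (theano_form,
# chunks, sample_parameter, one_iteration, one_sim_iteration, train) and
# globals (l_action_formed, reward_prediction, SimBalanceTask, LEARNING_RATE,
# N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES, N_OUTPUT_FEATURES) defined
# elsewhere in this project. The sketches here are plausible reconstructions
# inferred from how the names are used, not the original implementations;
# replace them with the real definitions where available.

from numpy import asarray, float32
from numpy.random import normal


def theano_form(data, shape):
    """Cast data to a float32 array of the given shape (Theano-friendly)."""
    return asarray(data, dtype=float32).reshape(shape)


def chunks(data, size):
    """Yield consecutive slices of `data` of length `size`."""
    for i in xrange(0, len(data), size):
        yield data[i:i + size]


def sample_parameter(sigmas):
    """Draw a symmetric PGPE/SPSA-style perturbation from N(0, sigmas**2).

    Returns the perturbation and its mirrored counterpart.
    """
    epsilon = normal(0., sigmas)
    return epsilon, -epsilon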
# Additional imports used by main(); fold them into the import block above if preferred.
import pickle

import lasagne
from numpy import ones, concatenate
from numpy.random import uniform
from pybrain.rl.environments.cartpole import BalanceTask


def main():
    """PGPE-style policy search on the cart-pole balancing task.

    Alternates between rollouts on the real task and rollouts on a simulated
    task driven by a learned prediction (critic) network, and retrains that
    network on data gathered from the real rollouts.
    """
    # create environment
    env = CartPoleEnvironment()
    # create the real task and its simulated counterpart
    task = BalanceTask(env, 200, desiredValue=None)
    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)

    all_params = lasagne.layers.get_all_params(l_action_formed)
    records = []
    real_world_sample_counts = []

    for time in xrange(50):
        records.append([])

        # reinitialise the policy parameters for each run
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))

        baseline = None
        num_parameters = 4      # four policy parameters
        init_sigma = 3          # initial exploration sigma
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []
        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0
        cost_confidence = 2

        for n in xrange(1500):
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)

            if previous_cost <= cost_confidence:
                # The prediction network is trusted: evaluate the symmetric
                # perturbations on the simulated task only.
                rewards1, actions1, observations1, last_obs1, reward1 = \
                    one_sim_iteration(sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = \
                    one_sim_iteration(sim_task, all_params=current - epsilon)
                thinking_count += 1
                if thinking_count == 2:
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # Perform actions in the real environment
                rewards1, actions1, observations1, last_obs1, reward1 = \
                    one_iteration(task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = \
                    one_iteration(task=task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2

                # Prepare data from the first rollout
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1:], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate([theano_form(rewards1, shape=(len(rewards1), 1)),
                                            predicted_obs1], axis=1)
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))

                # Prepare data from the second rollout
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1:], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate([theano_form(rewards2, shape=(len(rewards2), 1)),
                                            predicted_obs2], axis=1)
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))

                # Target training cost decays with n but never drops below cost_confidence
                train_base_line = (700 - n * 6) / 2 if (700 - n * 6) / 2 > cost_confidence else cost_confidence

                # Train the prediction network on the first rollout
                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for input, output in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    elif not count1 % 50:
                        print mean(costs1)
                        # print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break

                # Train the prediction network on the second rollout
                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for input, output in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))
                    if mean(costs2) < train_base_line:
                        break
                    elif not count2 % 50:
                        print mean(costs2)
                        # print "mean cost 2: ", mean(costs2), "baseline :", train_base_line
                    if count2 > 1:
                        break

                previous_cost = sum(costs1) + sum(costs2)

            mreward = (reward1 + reward2) / 2.
            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # calc the gradients
                if reward1 != reward2:
                    # gradient estimate a la SPSA, but with likelihood gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with moving-average baseline
                norm = best_reward - baseline
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / (best_reward - baseline)
                else:
                    fakt2 = 0.0

            # update baseline
            baseline = 0.9 * baseline + 0.1 * mreward

            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon
            if fakt2 > 0:
                # sigma adaptation follows only positive gradients; apply the update locally
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas

            # Test set
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2) / 2.0
            arg_reward.append(test_mreward)

            print n
            if not n % 10:
                print "test_reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test_reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2
                print "previous_cost :", previous_cost
                print "real_world_sample_count :", real_world_sample_count
                temp_arg = sum(arg_reward) / len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []

        real_world_sample_counts.append(real_world_sample_count)

    # print records
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))
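# Entry point, added for runnability; the original project may invoke main() elsewhere.
if __name__ == '__main__':
    main()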