def __init__(self, prediction, maxsteps):
    """Initialise the simulated balancing task.

    :param prediction: compiled predictor callable that simulates the
        cart-pole dynamics from an action/sensor history window.
    :param maxsteps: maximum number of time steps per episode.
    """
    super(SimBalanceTask, self).__init__(None)
    # Episode bookkeeping: step counter and step limit.
    self.t = 0
    self.N = maxsteps
    self.prediction = prediction
    # Rolling windows holding the most recent sensor/action values,
    # pre-filled with zeros.
    self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
    self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
    # The current reading starts as the newest (all-zero) buffer entry.
    self.sensors = self.sensors_sequence.data[-1]
def reset(self):
    """Restart the episode: fresh buffers and a new start state."""
    if self.randomInitialization:
        start_angle = random.uniform(-0.2, 0.2)
        start_pos = random.uniform(-0.5, 0.5)
    else:
        start_angle, start_pos = -0.2, 0.2
    self.t = 0
    # Throw away the old histories and begin from all-zero windows.
    self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
    self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
    # Sensor layout appears to be [angle, angle_vel, position, velocity];
    # velocities start at rest.
    self.sensors = (start_angle, 0.0, start_pos, 0.0)
    self.sensors_sequence.append(self.sensors)
class SimBalanceTask(EpisodicTask):
    """Pole-balancing task whose dynamics come from a learned predictor.

    Instead of stepping a physical cart-pole simulator, each call to
    ``performAction`` feeds the recent (action, sensor) history through
    the ``prediction`` network, whose output supplies both the next
    sensor reading and the reward for the step.
    """

    # Start each episode from a randomized pole angle / cart position.
    randomInitialization = True

    def __init__(self, prediction, maxsteps):
        """
        :param prediction: compiled predictor; consumes an
            (N_CBATCH, N_CTIME_STEPS, 5) action+sensor history window.
        :param maxsteps: maximum number of time steps per episode.
        """
        super(SimBalanceTask, self).__init__(None)
        self.prediction = prediction
        # Rolling histories of the last N_CTIME_STEPS sensors/actions.
        self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
        self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
        self.sensors = self.sensors_sequence.data[-1]
        self.t = 0
        self.N = maxsteps

    def performAction(self, action):
        """Advance one simulated step driven by the predictor network."""
        self.t += 1
        self.actions_sequence.append(action[0][0])
        # Model input: action (1 feature) and sensor (4 feature) histories
        # concatenated on the feature axis -> (N_CBATCH, N_CTIME_STEPS, 5).
        predict_input = concatenate([
            theano_form(self.actions_sequence.data,
                        shape=(N_CBATCH, N_CTIME_STEPS, 1)),
            theano_form(self.sensors_sequence.data,
                        shape=(N_CBATCH, N_CTIME_STEPS, 4))], axis=2)
        prediction = self.prediction(predict_input)
        # Last time step of the prediction: element 0 is the reward,
        # elements 1: are the next four sensor values.
        # FIX: removed leftover debug `print "sensors", ...` and
        # `raw_input()`, which blocked on stdin at every single step.
        self.sensors = prediction[0][-1][1::]
        self.sensors_sequence.append(self.sensors)
        self.reward = prediction[0][-1][0]

    def getObservation(self):
        """Return the current (predicted) sensor vector as an array."""
        return array(self.sensors)

    def getPoleAngles(self):
        # Sensor layout: [angle, angle_vel, position, velocity] —
        # assumed from getCartPosition below; confirm against predictor.
        return self.sensors[0]

    def getCartPosition(self):
        return self.sensors[2]

    def isFinished(self):
        """Episode ends on pole fall, cart out of bounds, or step limit."""
        if abs(self.getPoleAngles()) > 0.7:
            # pole has fallen
            return True
        elif abs(self.getCartPosition()) > 2.4:
            # cart is out of it's border conditions
            return True
        elif self.t >= self.N:
            # maximal timesteps
            return True
        return False

    def reset(self):
        """Restart the episode with fresh buffers and a new start state."""
        if self.randomInitialization:
            angle = random.uniform(-0.2, 0.2)
            pos = random.uniform(-0.5, 0.5)
        else:
            angle = -0.2
            pos = 0.2
        self.t = 0
        self.sensors_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0] * 4)
        self.actions_sequence = RingBuffer(N_CTIME_STEPS, ivalue=[0.0])
        self.sensors = [angle, 0.0, pos, 0.0]
        self.sensors_sequence.append(self.sensors)

    def getReward(self):
        """Reward predicted by the network on the most recent step."""
        return self.reward
# Theano Functions for Critic Network, critic_train = theano.function([critic_input, critic_output], critic_cost, updates=critic_updates) # Predict Action critic_prediction = theano.function( [critic_input], l_reward_formed.get_output(critic_input)) # Compute the cost critic_cost = theano.function([critic_input, critic_output], critic_cost) # Record all costs of the Actor Network. critic_costs = np.zeros(N_ITERATIONS) # Initialize serial communication class serial = SocketServer() ring_buffer = RingBuffer(size=N_TIME_STEPS + 1) # need reward of next step for training actions_set = RingBuffer(size=N_TIME_STEPS) actions_set.data = binomial(1, 0.5, N_TIME_STEPS).astype( theano.config.floatX).tolist() iter_init_actions = iter(actions_set.data) costs = [0] * N_ITERATIONS # Send n_time_steps information to client serial.send("%d\0" % N_TIME_STEPS) # Form forget vector forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)]) for n in range(N_ITERATIONS): if None in ring_buffer.get(): signal = serial.receive()
#y_pred_action = theano.function([input], l_action_formed.get_output(input)) reward_prediction = theano.function([input], l_reward_formed.get_output(input)) # Predict Action action_prediction = theano.function([input], l_action_formed.get_output(input)) # Compute the cost compute_cost = theano.function([input, target_output], cost) # Training the network costs = np.zeros(N_ITERATIONS) # Initialize serial communication class serial = SocketServer() ring_buffer = RingBuffer(size=N_TIME_STEPS + 1) # need reward of next step for training # Send n_time_steps information to client serial.send("%i\0" % N_TIME_STEPS) # Form forget vector forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)]) for n in range(N_ITERATIONS): signal = serial.receive() epoch_data = signal.split(',') # rm1 is reward of last time step ring_buffer.append(epoch_data) buffered_data = ring_buffer.get() if None not in buffered_data: all_data = theano_form(list=buffered_data, shape=[N_BATCH, N_TIME_STEPS+1, N_TRANS])
reward_prediction = theano.function([input], l_reward_formed.get_output(input)) # Predict Action action_prediction = theano.function([input], l_action_formed.get_output(input)) # Compute the cost compute_cost = theano.function([input, target_output], cost) # Training the network costs = np.zeros(N_ITERATIONS) # Initialize serial communication class serial = SocketServer() ring_buffer = RingBuffer(size=N_TIME_STEPS + 1) # need reward of next step for training # Form forget vector forget_vector = array([FORGET_RATE**i for i in xrange(N_TIME_STEPS)]) # create environment env = CartPoleEnvironment() # create task task = BalanceTask(env, 200, desiredValue=None) # Cost = mean squared error, starting from delay point cost = T.mean((l_action_formed.get_output(input)[:, :, :] - target_output[:, :, :])**2) unfolding_time = 10 for n in range(N_ITERATIONS):