Example #1
    def performAction(self, action):
        self.t += 1
        self.actions_sequence.append(action[0][0])
        predict_input = concatenate([theano_form(self.actions_sequence.data, shape=(N_CBATCH, N_CTIME_STEPS, 1)),
                                     theano_form(self.sensors_sequence.data, shape=(N_CBATCH, N_CTIME_STEPS, 4))], axis=2)
        prediction = self.prediction(predict_input)
        # The last time step of the prediction holds [reward, sensors...]:
        # element 0 is the predicted reward, the rest are the predicted sensors.
        self.sensors = prediction[0][-1][1:]
        print "sensors", self.sensors
        raw_input()  # pause for manual inspection
        self.sensors_sequence.append(self.sensors)
        self.reward = prediction[0][-1][0]
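
Note: these examples rely on a `theano_form` helper that is not defined anywhere on this page. Judging from the call sites (it receives a nested list or array plus a target shape, and its result is fed to Theano functions), it presumably just builds a float NumPy array of the requested shape. A minimal sketch under that assumption only:

import numpy as np

def theano_form(list, shape):
    # Assumed behaviour: flatten the input and reshape it into a float array
    # suitable for a Theano function. The parameter name `list` (shadowing the
    # builtin) matches the keyword used in these examples, e.g.
    # D.theano_form(list=buffered_data, shape=[...]).
    return np.asarray(list, dtype=np.float32).reshape(shape)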
Example #2
    def train(self):
        print "sending"
        # first send n_time_steps information to the client
        self.setting.serial.send_int(self.setting.n_time_steps)
        print "sent"
        self.cost = [0] * self.setting.n_iterations
        for n in xrange(self.setting.n_iterations):
            signal = self.setting.serial.receive()
            epoch_data = signal.split(',')  # rm1 is reward of last time step
            self.ring_buffer.append(epoch_data)
            buffered_data = self.ring_buffer.get()
            if None not in buffered_data:
                all_data = D.theano_form(list=buffered_data,
                                         shape=[
                                             self.setting.n_batches,
                                             self.setting.n_time_steps + 1,
                                             self.setting.n_trans
                                         ])

                actor_train_inputs = all_data[:, 0:self.setting.n_time_steps, 1:]
                # Predict action of actor model
                action_predict = self.mba.predict(actor_train_inputs)
                critic_train_inputs = np.dstack((action_predict, actor_train_inputs))
                # Rewards from time step 1 to N_TIME_STEPS; the reward takes
                # the first position of each transition.
                critic_train_outputs = all_data[:, 1:, 0].reshape(
                    [self.setting.n_batches,
                     self.setting.n_time_steps,
                     self.setting.n_output_features])
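
The slicing in the example above assumes that each transition vector stores the reward at position 0, followed by the state/action features, and that `all_data` carries one extra time step. A quick shape check of that layout with made-up sizes:

import numpy as np

n_batches, n_time_steps, n_trans = 1, 5, 6   # made-up sizes, for illustration only
all_data = np.zeros((n_batches, n_time_steps + 1, n_trans))

actor_train_inputs = all_data[:, 0:n_time_steps, 1:]   # drop the reward column and the last step
critic_train_outputs = all_data[:, 1:, 0]              # rewards of steps 1..n_time_steps
print actor_train_inputs.shape    # (1, 5, 5)
print critic_train_outputs.shape  # (1, 5); the example reshapes this to (n_batches, n_time_steps, n_output_features)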
Example #3
def one_iteration(task, all_params):
    """
    Give current value of weights, output all rewards
    :return:
    """
    rewards = []
    observations = []
    actions = []
    _all_params = lasagne.layers.get_all_params(l_action_formed)
    _all_params[0].set_value(theano_form(all_params, shape=(4, 1)))
    task.reset()
    while not task.isFinished():
        obs = task.getObservation()
        observations.append(obs)
        states = theano_form(obs, shape=[N_BATCH, 1, N_INPUT_FEATURES - 1])  # one time step at a time
        model_action_result = action_prediction(states)
        actions.append(model_action_result.reshape(1))
        task.performAction(model_action_result)
        rewards.append(task.getReward())
    last_obs = task.getObservation()
    return rewards, actions, observations, last_obs, sum(rewards)
Example #4
def one_sim_iteration(task, all_params):
    """
    This function estimates the reward by
    RNN function. in our case, it is LSTM
    """

    rewards = []
    observations = []
    actions = []
    _all_params = lasagne.layers.get_all_params(l_action_formed)
    _all_params[0].set_value(theano_form(all_params, shape=(4, 1)))

    while not task.isFinished():
        obs = task.getObservation()
        observations.append(obs)
        states = theano_form(obs, shape=[N_BATCH, 1, N_INPUT_FEATURES - 1])  # one time step at a time
        model_action_result = action_prediction(states)
        actions.append(model_action_result.reshape(1))
        task.performAction(model_action_result)
        rewards.append(task.getReward())
    last_obs = task.getObservation()
    return rewards, actions, observations, last_obs, sum(rewards)
Example #5
    def train(self):
        self.build_functions()
        print "sending"
        # first send n_time_steps information to the client
        self.setting.serial.send_int(self.setting.n_time_steps)
        print "sent"
        self.costs = [0] * self.setting.n_iterations
        for n in xrange(self.setting.n_iterations):
            signal = self.setting.serial.receive()
            epoch_data = signal.split(',')  # rm1 is reward of last time step
            self.ring_buffer.append(epoch_data)
            buffered_data = self.ring_buffer.get()
            if None not in buffered_data:
                all_data = D.theano_form(list=buffered_data,
                                         shape=[
                                             self.setting.n_batches,
                                             self.setting.n_time_steps + 1,
                                             self.setting.n_trans
                                         ])

                train_inputs = all_data[:, 0:self.setting.n_time_steps, 1::]

                # Desired output: rewards from time step 1 to N_TIME_STEPS;
                # the reward is the first element of each transition.
                train_outputs = all_data[:, 1:, 0].reshape(
                    [self.setting.n_batches,
                     self.setting.n_time_steps,
                     self.setting.n_output_features])
                self.costs[n] = self._train(train_inputs, train_outputs)
                # Take the most recent predicted action and map it from {0, 1} to {-1, +1}.
                action = self.get_binomial_action(
                    self.pred_action(train_inputs)[:, -1]) * 2 - 1

                self.setting.serial.send_int(action)
                if not n % 10:
                    cost_val = self.compute_cost(train_inputs, train_outputs)
                    model_reward_result = self.predict(train_inputs)
                    print "Iteration {} validation cost = {}".format(
                        n, cost_val)
                    print "reward predict: "
                    print model_reward_result
                    print "train results:"
                    print train_outputs
                    print "predcted action"
                    print self.pred_action(train_inputs)[:, -1]
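
`get_binomial_action` is not shown on this page. Since its result is mapped to a {-1, +1} action via `* 2 - 1` and sent as an integer, it presumably draws a 0/1 sample from a Bernoulli distribution whose probability is the network's predicted action. A sketch under that assumption (behaviour inferred, not confirmed):

import numpy as np

def get_binomial_action(probability):
    # Assumed behaviour: draw a 0/1 sample with the predicted probability;
    # the caller maps the result to {-1, +1} via `* 2 - 1`.
    return np.random.binomial(n=1, p=probability)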
Example #6
    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)

    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)

    all_params = lasagne.layers.get_all_params(l_action_formed)

    records = []
    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(
            theano_form(uniform(-0.1, 0.1, 4), shape=(4, 1)))

        baseline = None
        num_parameters = 4  # four parameters
        init_sigma = 3  # initial value of sigma
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []

        previous_cost = 10000
        real_world_sample_counts = 0
        thinking_count = 0
        for n in xrange(1500):

            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
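
`sample_parameter` is also not shown here. Given how `epsilon` is used (symmetric perturbations `current + epsilon` and `current - epsilon` in Example #7), it presumably draws one zero-mean Gaussian perturbation per parameter, PGPE-style. A minimal sketch under that assumption; `epsilon_star` is never used in the code shown, so the second return value is only a placeholder:

from numpy.random import normal

def sample_parameter(sigmas):
    # Assumed behaviour: one zero-mean Gaussian perturbation per parameter,
    # with per-parameter standard deviations `sigmas`.
    epsilon = normal(0., sigmas)
    # Placeholder second value; its real meaning is not visible in these snippets.
    return epsilon, -epsilon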
Example #7
def main():



    # create environment
    env = CartPoleEnvironment()
    # create task
    task = BalanceTask(env, 200, desiredValue=None)

    sim_task = SimBalanceTask(prediction=reward_prediction, maxsteps=200)



    all_params = lasagne.layers.get_all_params(l_action_formed)

    records = []
    real_world_sample_counts = []
    for time in xrange(50):
        records.append([])
        _all_params = lasagne.layers.get_all_params(l_action_formed)
        _all_params[0].set_value(theano_form(uniform(-0.1, 0.1, 4), shape=(4,1)))


        baseline = None
        num_parameters = 4  # four parameters
        init_sigma = 3  # initial value of sigma
        sigmas = ones(num_parameters) * init_sigma
        best_reward = -1000
        current = all_params[0].get_value()[:, 0]
        arg_reward = []


        previous_cost = 10000
        real_world_sample_count = 0
        thinking_count = 0

        cost_confidence = 2

        for n in xrange(1500):

            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            if previous_cost <= cost_confidence:

                rewards1, actions1, observations1, last_obs1, reward1 = one_sim_iteration(sim_task, all_params=current + epsilon)
                rewards2, actions2, observations2, last_obs2, reward2 = one_sim_iteration(sim_task, all_params=current - epsilon)
                thinking_count += 1
                if thinking_count == 2:
                    previous_cost = 10000
                    thinking_count = 0
            else:
                # Perform actions in real environment

                rewards1, actions1, observations1, last_obs1, reward1 = one_iteration(task=task, all_params=current + epsilon)
                real_world_sample_count += 1
                if reward1 > best_reward:
                    best_reward = reward1
                rewards2, actions2, observations2, last_obs2, reward2 = one_iteration(task=task, all_params=current - epsilon)
                real_world_sample_count += 1
                if reward2 > best_reward:
                    best_reward = reward2


                # Prepare data from the first (+epsilon) rollout
                actions1 = theano_form(actions1, shape=(len(actions1), 1))
                observations1 = theano_form(observations1, shape=(len(observations1), 4))
                predicted_obs1 = concatenate([observations1[1::], [last_obs1]])
                input_data1 = concatenate([actions1, observations1], axis=1)
                output_data1 = concatenate([theano_form(rewards1, shape=(len(rewards1), 1)), predicted_obs1], axis=1)

                # Training with data gathered from first process
                critic_train_inputs1 = list(chunks(input_data1, N_CTIME_STEPS))
                critic_train_outputs1 = list(chunks(output_data1, N_CTIME_STEPS))
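                # `chunks` is defined elsewhere in this project; presumably it
                # yields successive N_CTIME_STEPS-long slices of its argument,
                # roughly: for i in xrange(0, len(seq), n): yield seq[i:i + n]
                # (an assumption -- the actual helper is not shown on this page).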


                # Prepare data from the second (-epsilon) rollout
                actions2 = theano_form(actions2, shape=(len(actions2), 1))
                observations2 = theano_form(observations2, shape=(len(observations2), 4))
                predicted_obs2 = concatenate([observations2[1::], [last_obs2]])
                input_data2 = concatenate([actions2, observations2], axis=1)
                output_data2 = concatenate([theano_form(rewards2, shape=(len(rewards2), 1)), predicted_obs2], axis=1)

                # Training with data gathered from second process
                critic_train_inputs2 = list(chunks(input_data2, N_CTIME_STEPS))
                critic_train_outputs2 = list(chunks(output_data2, N_CTIME_STEPS))



                # Annealed cost target for critic training, floored at cost_confidence.
                train_base_line = max((700 - n * 6) / 2, cost_confidence)

                count1 = 0
                while True:
                    count1 += 1
                    costs1 = []
                    for input, output in zip(critic_train_inputs1, critic_train_outputs1):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs1.append(train(critic_train_input, critic_train_output))
                    if mean(costs1) < train_base_line:
                        break
                    else:
                        if not count1%50:
                            print mean(costs1)
                        #print "mean cost 1: ", mean(costs1), "baseline :", train_base_line
                    if count1 > 1:
                        break


                count2 = 0
                while True:
                    count2 += 1
                    costs2 = []
                    for input, output in zip(critic_train_inputs2, critic_train_outputs2):
                        critic_train_input = theano_form(input, shape=(N_CBATCH, N_CTIME_STEPS, N_CINPUT_FEATURES))
                        critic_train_output = theano_form(output, shape=(N_CBATCH, N_CTIME_STEPS, N_OUTPUT_FEATURES))
                        costs2.append(train(critic_train_input, critic_train_output))

                    if mean(costs2) < train_base_line:
                        break
                    else:
                        if not count2%50:
                            print mean(costs2)

                        #print "mean cost2: ", mean(costs2), "baseline :", train_base_line

                    if count2 > 1:
                        break

                previous_cost = sum(costs1) + sum(costs2)


            mreward = (reward1 + reward2) / 2.

            if baseline is None:
                # first learning step
                baseline = mreward
                fakt = 0.
                fakt2 = 0.
            else:
                # calc the gradients
                if reward1 != reward2:
                    # gradient estimate a la SPSA, but with likelihood gradient and normalization
                    fakt = (reward1 - reward2) / (2. * best_reward - reward1 - reward2)
                else:
                    fakt = 0.
                # normalized sigma gradient with moving-average baseline
                norm = best_reward - baseline
                if norm != 0.0:
                    fakt2 = (mreward - baseline) / (best_reward - baseline)
                else:
                    fakt2 = 0.0
            #update baseline
            baseline = 0.9 * baseline + 0.1 * mreward
            # update parameters and sigmas
            current = current + LEARNING_RATE * fakt * epsilon

            if fakt2 > 0:  # for sigma adaptation, the algorithm follows only positive gradients
                # apply the sigma update locally
                sigmas = sigmas + LEARNING_RATE * fakt2 * (epsilon * epsilon - sigmas * sigmas) / sigmas


            # Test set
            epsilon, epsilon_star = sample_parameter(sigmas=sigmas)
            _, _, _, _, test_reward1 = one_iteration(task=task, all_params=current + epsilon)
            _, _, _, _, test_reward2 = one_iteration(task=task, all_params=current - epsilon)
            test_mreward = (test_reward1 + test_reward2)/ 2.0
            arg_reward.append(test_mreward)

            print n


            if not n%10:
                print "test_reward 1:", test_reward1
                _, _, _, _, sim_test_reward1 = one_sim_iteration(task=sim_task, all_params=current + epsilon)
                print "simulated reward 1:", sim_test_reward1
                print "test_reward 2:", test_reward2
                _, _, _, _, sim_test_reward2 = one_sim_iteration(task=sim_task, all_params=current - epsilon)
                print "simulated reward 2:", sim_test_reward2


                print "previous_cost :", previous_cost
                print "real_word_example :", real_world_sample_count
                temp_arg = sum(arg_reward)/len(arg_reward)
                records[time].append([real_world_sample_count, temp_arg])
                print "best reward:", best_reward, "average reward:", temp_arg
                print
                arg_reward = []
        real_world_sample_counts.append(real_world_sample_count)
    #print records
    pickle.dump(records, open("records_lambda_mu.p", "wb"))
    pickle.dump(real_world_sample_counts, open("real_world_sample_counts_mu.p", "wb"))
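
For later analysis, the dumped files can be read back with pickle; a trivial usage sketch using the file names from the code above:

import pickle

records = pickle.load(open("records_lambda_mu.p", "rb"))
real_world_sample_counts = pickle.load(open("real_world_sample_counts_mu.p", "rb"))
print len(records), len(real_world_sample_counts)   # 50 entries each, if the outer loop above completed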