Example #1
def tamer_algorithm():
    weights_file_str = 'weights/weights_{}.hdf5'

    puddy = PUDDLER()
    init_state = puddy.get_initial_state()
    all_actions = puddy.get_possible_actions()
    current_act_ind = randint(0, len(all_actions) - 1)

    EPISODE_LIMIT = 40
    step_count = 0
    episode_number = 0
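    # One regression model plus X/y training buffers per action; each model
    # learns to predict the human reinforcement h for its own action.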
    actions_models, actions_X_train, actions_y_train, aux_X_train, aux_y_train \
        = get_action_models_and_training_sets(all_actions)

    current_state = puddy.get_next_state(init_state,
                                         all_actions[current_act_ind])
    # current_act_ind = epsilon_greedy(current_state, actions_models, all_actions, explanation_features)

    start_time = time.time()

    batch_size = 250
    num_iters = 100000
    number_of_no_exp = 0
    number_of_exp = 0
    for i in range(num_iters):
        prev_best_action = all_actions[current_act_ind]
        model_name = 'nn_model_{}'.format(prev_best_action)
        X_train_name = 'X_train_{}'.format(prev_best_action)
        y_train_name = 'y_train_{}'.format(prev_best_action)

        explanation_features = choose_random_expln_features()
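        # The first explanation feature is treated as an "explanation given"
        # flag for the bookkeeping counters below.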
        if explanation_features[0] > 0.5:
            number_of_exp += 1
        else:
            number_of_no_exp += 1
        print(explanation_features)
        step_count += 1
        # Get the human reward:
        h = puddy.get_human_reinf_from_prev_step(current_state,
                                                 all_actions[current_act_ind],
                                                 explanation_features)
        aux_y_train[y_train_name].append(h)

        print("prev_best_action", current_state, prev_best_action, h)
        xf = explanation_features
        aux_X_train[X_train_name].append(
            [current_state.x, current_state.y, xf[0], xf[1], xf[2]])
        actions_X_train[X_train_name] = np.array(aux_X_train[X_train_name])
        actions_y_train[y_train_name] = np.array(aux_y_train[y_train_name])

        # If have a batch of data ready, train and predict from it
        # Update the models if we are on a batch_size iteration
        if i % batch_size == 0:
            for poss_act in all_actions:
                train_weights_file = weights_file_str.format(poss_act)
                train_model_name = 'nn_model_{}'.format(poss_act)
                train_X_name = 'X_train_{}'.format(poss_act)
                train_y_name = 'y_train_{}'.format(poss_act)

                curr_model = actions_models[train_model_name]

                try:
                    curr_model.load_weights(train_weights_file)
                except (IOError, OSError):
                    # No saved weights yet (first pass); train from scratch.
                    pass
                print("----------------------------------")
                print "IN ITERATION {}".format(i)
                print("TRAINING {}".format(poss_act))
                X_train = actions_X_train[train_X_name]
                y_train = actions_y_train[train_y_name]
                if len(X_train) > 0:
                    curr_model.fit(X_train, y_train, nb_epoch=20, batch_size=2)
                    curr_model.save_weights(train_weights_file)
                else:
                    print("actions ", poss_act)

        # Get the next state based on action (random for the moment)
        new_state = puddy.get_next_state(current_state,
                                         all_actions[current_act_ind])

        # This is the predict part
        current_act_ind = epsilon_greedy(new_state, actions_models,
                                         all_actions, explanation_features,
                                         episode_number, step_count)
        # print ("current action", all_actions[current_act_ind])
        current_state = copy.deepcopy(new_state)

        if current_state.is_terminal() or step_count >= EPISODE_LIMIT:
            current_state = puddy.get_initial_state()
            step_count = 0
            episode_number += 1

    elapsed_time = time.time() - start_time
    print("------------------------------------------------------------")
    print(" Elapsed time to train: {}".format(elapsed_time))
    print("------------------------------------------------------------")
    print("No of explanation examples", number_of_exp)
    print("No of no explanation examples", number_of_no_exp)
    # ------------ EVAL --------------  #
    actions_models = load_trained_actions_models(all_actions)

    explanation_features = [0, 0, 0]
    # Test the policy
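    # First roll out puddy's own best-action policy (the built-in RL agent);
    # the TAMER per-action models are evaluated in the loops further below.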
    current_state = puddy.get_initial_state()
    print("Start from state", current_state)
    curr_char = "S"
    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = puddy.get_best_action(current_state, explanation_features)
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

    explanation_features = [0, 0, 0]
    current_state = puddy.get_initial_state()
    print(
        "Best action from tamer",
        all_actions[get_best_action(current_state, actions_models, all_actions,
                                    explanation_features)])

    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

    explanation_features = [0, 0, 1]
    current_state = puddy.get_initial_state()
    print(
        "Best action from tamer",
        all_actions[get_best_action(current_state, actions_models, all_actions,
                                    explanation_features)])

    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")
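
Example #1 (and Example #3 below) calls epsilon_greedy and get_best_action, which are defined elsewhere in the module. The following is only a minimal sketch of what they presumably look like, assuming each per-action model (apparently Keras, given the .hdf5 weights and fit/load_weights calls) scores the same [x, y, xf[0], xf[1], xf[2]] feature vector used for training, and that the greedy choice is the action whose model predicts the highest human reinforcement; the epsilon value and the unused episode_number/step_count arguments are guesses.

import numpy as np
from random import randint, random


def get_best_action(state, actions_models, all_actions, explanation_features):
    # Score every action with its reward model and return the index of the
    # action whose model predicts the highest human reinforcement.
    xf = explanation_features
    features = np.array([[state.x, state.y, xf[0], xf[1], xf[2]]])
    scores = []
    for act in all_actions:
        model = actions_models['nn_model_{}'.format(act)]
        scores.append(model.predict(features).flatten()[0])
    return int(np.argmax(scores))


def epsilon_greedy(state, actions_models, all_actions, explanation_features,
                   episode_number=0, step_count=0, epsilon=0.1):
    # Explore a random action with probability epsilon, otherwise exploit
    # the greedy choice from the per-action models.
    if random() < epsilon:
        return randint(0, len(all_actions) - 1)
    return get_best_action(state, actions_models, all_actions,
                           explanation_features)
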
Example #2

def tamer_algorithm():

    puddy = PUDDLER()

    init_state = puddy.get_initial_state()
    all_actions = puddy.get_possible_actions()

    explanation_features = []  #[0,0,0]

    current_act_ind = randint(0, len(all_actions) - 1)

    #X_train = np.array([list(init_state.features()) + explanation_features +[current_act_ind]])
    #y_train = np.array([puddy.get_human_reinf_from_prev_step(init_state, all_actions[current_act_ind], explanation_features)])

    # Fit the values from the data
    #reg = SGDRegressor(max_iter=100).fit(X_train, y_train)
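    # Online linear approximator used in place of the batched SGDRegressor
    # above; it is updated one (state, action, h) sample at a time.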
    approx_model = LinearFuncApprox(num_features=2, actions=all_actions)
    approx_model.update(
        init_state, all_actions[current_act_ind],
        puddy.get_human_reinf_from_prev_step(init_state,
                                             all_actions[current_act_ind]))

    # s = [x, y, p1, p2, a]
    # Up: 1, Right: 2, Down: 3, Left: 4
    #s = [0.1, 0.1, 1, 0]
    #a = get_best_action(s)

    #s.append(a)
    #np_s = np.array([s])

    current_state = puddy.get_next_state(init_state,
                                         all_actions[current_act_ind])
    current_act_ind = epsilon_greedy(current_state, approx_model, all_actions,
                                     explanation_features)

    for i in range(10000):

        # Get the human reward:
        h = puddy.get_human_reinf_from_prev_step(current_state,
                                                 all_actions[current_act_ind],
                                                 explanation_features)
        # We assume that the human model is optimal
        #if h != 0:
        # Online learning:
        print(current_state, h, all_actions[current_act_ind])
        approx_model.update(current_state, all_actions[current_act_ind], h)

        # Get the next state based on action (random for the moment)
        new_state = puddy.get_next_state(current_state,
                                         all_actions[current_act_ind])
        current_act_ind = epsilon_greedy(new_state, approx_model, all_actions,
                                         explanation_features)
        #print ("current action", all_actions[current_act_ind])
        current_state = copy.deepcopy(new_state)
        if current_state.is_terminal():
            current_state = puddy.get_initial_state()

    # Test the policy
    current_state = puddy.get_initial_state()
    print("Start from state", current_state)
    print(
        "Best action from tamer",
        all_actions[get_best_action(current_state, approx_model, all_actions,
                                    explanation_features)])
    print("Best action from RL agent", puddy.get_best_action(current_state))
Example #3
def tamer_algorithm():
    puddy = PUDDLER()
    all_actions = puddy.get_possible_actions()
    save_weights_file = 'weights/weights_{}.hdf5'
    load_weights_file = 'weights-test/weights_{}.hdf5'

    # train_action_models(save_weights_file, all_actions, puddy)
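    # Training is skipped in this example; it only evaluates models whose
    # weights were previously saved under weights-test/.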

    actions_models = load_trained_actions_models(all_actions,
                                                 load_weights_file)

    # explanation_features = [0,0,0]
    # # Test the policy
    # current_state = puddy.return_state(0,0.2) #get_initial_state()
    # print("Start from state", current_state)
    # curr_char = "S"
    # puddy.visualize_agent(current_state)
    # curr_char = raw_input("")
    # while curr_char.lower() != 'n':
    #     a = puddy.get_best_action(current_state, explanation_features)
    #     print("Next action", a)
    #     next_state = puddy.get_next_state(current_state, a)
    #     print("New state", next_state)
    #     current_state = copy.deepcopy(next_state)
    #     puddy.visualize_agent(current_state)
    #     curr_char = raw_input("")

    explanation_features = [0, 0, 0]
    # current_state = puddy.get_initial_state()
    current_state = puddy.return_state(0, 0.2)
    print(
        "Best action from tamer",
        all_actions[get_best_action(current_state, actions_models, all_actions,
                                    explanation_features)])

    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

    explanation_features = [1]
    current_state = puddy.get_initial_state()
    print(
        "Best action from tamer",
        all_actions[get_best_action(current_state, actions_models, all_actions,
                                    explanation_features)])

    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")