# Assumed module-level imports (defined elsewhere in this file):
#   import copy, time
#   import numpy as np
#   from random import randint
# plus the project helpers PUDDLER, LinearFuncApprox, epsilon_greedy,
# get_best_action, choose_random_expln_features,
# get_action_models_and_training_sets, train_action_models and
# load_trained_actions_models.


def tamer_algorithm():
    # Batch TAMER variant: per-action Keras reward models trained from
    # accumulated (state, explanation-feature) examples.  NOTE: this
    # definition is shadowed by the later tamer_algorithm() definitions
    # in this file.
    weights_file_str = 'weights/weights_{}.hdf5'
    puddy = PUDDLER()
    init_state = puddy.get_initial_state()
    all_actions = puddy.get_possible_actions()
    current_act_ind = randint(0, len(all_actions) - 1)

    EPISODE_LIMIT = 40
    step_count = 0
    episode_number = 0

    actions_models, actions_X_train, actions_y_train, aux_X_train, aux_y_train \
        = get_action_models_and_training_sets(all_actions)

    current_state = puddy.get_next_state(init_state, all_actions[current_act_ind])
    # current_act_ind = epsilon_greedy(current_state, actions_models,
    #                                  all_actions, explanation_features)

    start_time = time.time()
    batch_size = 250
    num_iters = 100000
    number_of_no_exp = 0
    number_of_exp = 0

    for i in range(num_iters):
        prev_best_action = all_actions[current_act_ind]
        model_name = 'nn_model_{}'.format(prev_best_action)
        X_train_name = 'X_train_{}'.format(prev_best_action)
        y_train_name = 'y_train_{}'.format(prev_best_action)

        explanation_features = choose_random_expln_features()
        if explanation_features[0] > 0.5:
            number_of_exp += 1
        else:
            number_of_no_exp += 1
        print(explanation_features)
        step_count += 1

        # Get the human reward for the previous step:
        h = puddy.get_human_reinf_from_prev_step(
            current_state, all_actions[current_act_ind], explanation_features)
        aux_y_train[y_train_name].append(h)
        print("prev_best_action", current_state, prev_best_action, h)

        # Training example: (x, y) position plus the three explanation features.
        xf = explanation_features
        aux_X_train[X_train_name].append(
            [current_state.x, current_state.y, xf[0], xf[1], xf[2]])
        actions_X_train[X_train_name] = np.array(aux_X_train[X_train_name])
        actions_y_train[y_train_name] = np.array(aux_y_train[y_train_name])

        # Once a batch of data is ready, retrain the per-action models.
        if i % batch_size == 0:
            for poss_act in all_actions:
                train_weights_file = weights_file_str.format(poss_act)
                train_model_name = 'nn_model_{}'.format(poss_act)
                train_X_name = 'X_train_{}'.format(poss_act)
                train_y_name = 'y_train_{}'.format(poss_act)
                curr_model = actions_models[train_model_name]
                try:
                    curr_model.load_weights(train_weights_file)
                except Exception:
                    # No saved weights yet (e.g. on the first batch).
                    pass
                print("----------------------------------")
                print("IN ITERATION {}".format(i))
                print("TRAINING {}".format(poss_act))
                X_train = actions_X_train[train_X_name]
                y_train = actions_y_train[train_y_name]
                if len(X_train) > 0:
                    curr_model.fit(X_train, y_train, nb_epoch=20, batch_size=2)
                    curr_model.save_weights(train_weights_file)
                else:
                    print("actions ", poss_act)

        # Get the next state based on the chosen action.
        new_state = puddy.get_next_state(current_state, all_actions[current_act_ind])
        # This is the predict part: choose the next action epsilon-greedily
        # from the current per-action models.
        current_act_ind = epsilon_greedy(new_state, actions_models, all_actions,
                                         explanation_features, episode_number,
                                         step_count)
        # print("current action", all_actions[current_act_ind])
        current_state = copy.deepcopy(new_state)
        if current_state.is_terminal() or step_count >= EPISODE_LIMIT:
            current_state = puddy.get_initial_state()
            step_count = 0
            episode_number += 1

    elapsed_time = time.time() - start_time
    print("------------------------------------------------------------")
    print(" Elapsed time to train: {}".format(elapsed_time))
    print("------------------------------------------------------------")
    print("No of explanation examples", number_of_exp)
    print("No of no explanation examples", number_of_no_exp)

    # ------------ EVAL --------------
    # actions_models = load_trained_actions_models(all_actions)
    # --------------------------------

    # Test the policy of the underlying RL agent.
    current_state = puddy.get_initial_state()
    print("Start from state", current_state)
    curr_char = "S"
    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = puddy.get_best_action(current_state, explanation_features)
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

    # Test the learned TAMER policy without an explanation.
    explanation_features = [0, 0, 0]
    current_state = puddy.get_initial_state()
    print("Best action from tamer",
          all_actions[get_best_action(current_state, actions_models,
                                      all_actions, explanation_features)])
    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

    # Test the learned TAMER policy with the third explanation feature set.
    explanation_features = [0, 0, 1]
    current_state = puddy.get_initial_state()
    print("Best action from tamer",
          all_actions[get_best_action(current_state, actions_models,
                                      all_actions, explanation_features)])
    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

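# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the per-action model
# containers unpacked above come from get_action_models_and_training_sets(),
# whose real implementation lives elsewhere in this repo and may differ.
# The hypothetical helper below shows one plausible shape for it, assuming a
# small Keras regression network per action with 5 inputs (x, y and the three
# explanation features) and a scalar predicted human reinforcement.
def example_get_action_models_and_training_sets(all_actions):
    import numpy as np
    from keras.models import Sequential
    from keras.layers import Dense
    actions_models = {}
    actions_X_train = {}
    actions_y_train = {}
    aux_X_train = {}
    aux_y_train = {}
    for act in all_actions:
        model = Sequential()
        model.add(Dense(16, input_dim=5, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mse', optimizer='adam')
        actions_models['nn_model_{}'.format(act)] = model
        # Numpy arrays used for fitting, plus plain-list buffers that new
        # (state, explanation) examples are appended to on every step.
        actions_X_train['X_train_{}'.format(act)] = np.array([])
        actions_y_train['y_train_{}'.format(act)] = np.array([])
        aux_X_train['X_train_{}'.format(act)] = []
        aux_y_train['y_train_{}'.format(act)] = []
    return (actions_models, actions_X_train, actions_y_train,
            aux_X_train, aux_y_train)
# --------------------------------------------------------------------------

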
def tamer_algorithm():
    # Online TAMER variant: a single linear function approximator is updated
    # after every step from the human reinforcement signal.  NOTE: this
    # definition is shadowed by the final tamer_algorithm() below.
    puddy = PUDDLER()
    init_state = puddy.get_initial_state()
    all_actions = puddy.get_possible_actions()
    explanation_features = []  # [0, 0, 0]
    current_act_ind = randint(0, len(all_actions) - 1)

    # X_train = np.array([list(init_state.features()) + explanation_features
    #                     + [current_act_ind]])
    # y_train = np.array([puddy.get_human_reinf_from_prev_step(
    #     init_state, all_actions[current_act_ind], explanation_features)])
    # Fit the values from the data
    # reg = SGDRegressor(max_iter=100).fit(X_train, y_train)

    approx_model = LinearFuncApprox(num_features=2, actions=all_actions)
    approx_model.update(
        init_state, all_actions[current_act_ind],
        puddy.get_human_reinf_from_prev_step(init_state,
                                             all_actions[current_act_ind]))

    # s = [x, y, p1, p2, a]
    # Up: 1, Right: 2, Down: 3, Left: 4
    # s = [0.1, 0.1, 1, 0]
    # a = get_best_action(s)
    # s.append(a)
    # np_s = np.array([s])

    current_state = puddy.get_next_state(init_state, all_actions[current_act_ind])
    current_act_ind = epsilon_greedy(current_state, approx_model, all_actions,
                                     explanation_features)

    for i in range(10000):
        # Get the human reward:
        h = puddy.get_human_reinf_from_prev_step(
            current_state, all_actions[current_act_ind], explanation_features)
        # We assume that the human model is optimal.
        # if h != 0:
        # Online learning:
        print(current_state, h, all_actions[current_act_ind])
        approx_model.update(current_state, all_actions[current_act_ind], h)

        # Get the next state based on the chosen action.
        new_state = puddy.get_next_state(current_state, all_actions[current_act_ind])
        current_act_ind = epsilon_greedy(new_state, approx_model, all_actions,
                                         explanation_features)
        # print("current action", all_actions[current_act_ind])
        current_state = copy.deepcopy(new_state)
        if current_state.is_terminal():
            current_state = puddy.get_initial_state()

    # Test the policy.
    current_state = puddy.get_initial_state()
    print("Start from state", current_state)
    print("Best action from tamer",
          all_actions[get_best_action(current_state, approx_model, all_actions,
                                      explanation_features)])
    print("Best action from RL agent", puddy.get_best_action(current_state))

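# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): LinearFuncApprox used
# above is defined elsewhere in this repo, and its internals may differ.
# The hypothetical class below shows the kind of model assumed here: one
# linear weight vector per action over the (x, y) state features plus a bias,
# nudged toward the human reinforcement h with a plain gradient step.
class ExampleLinearFuncApprox(object):
    def __init__(self, num_features, actions, learning_rate=0.05):
        self.learning_rate = learning_rate
        # One weight vector per action: num_features state features + bias.
        self.weights = {a: np.zeros(num_features + 1) for a in actions}

    def _features(self, state):
        # Assumes the PUDDLER state exposes x and y coordinates.
        return np.array([state.x, state.y, 1.0])

    def predict(self, state, action):
        return float(np.dot(self.weights[action], self._features(state)))

    def update(self, state, action, h):
        # Move the prediction for (state, action) toward the human signal h.
        phi = self._features(state)
        error = h - np.dot(self.weights[action], phi)
        self.weights[action] += self.learning_rate * error * phi
# --------------------------------------------------------------------------

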
def tamer_algorithm():
    # Evaluation-only variant: being the last definition of tamer_algorithm()
    # in this file, this is the one actually used.  It loads previously
    # trained per-action models and steps through the learned policy
    # interactively.
    puddy = PUDDLER()
    all_actions = puddy.get_possible_actions()
    save_weights_file = 'weights/weights_{}.hdf5'
    load_weights_file = 'weights-test/weights_{}.hdf5'
    # train_action_models(save_weights_file, all_actions, puddy)
    actions_models = load_trained_actions_models(all_actions, load_weights_file)

    # explanation_features = [0, 0, 0]
    # # Test the policy of the underlying RL agent.
    # current_state = puddy.return_state(0, 0.2)  # get_initial_state()
    # print("Start from state", current_state)
    # curr_char = "S"
    # puddy.visualize_agent(current_state)
    # curr_char = raw_input("")
    # while curr_char.lower() != 'n':
    #     a = puddy.get_best_action(current_state, explanation_features)
    #     print("Next action", a)
    #     next_state = puddy.get_next_state(current_state, a)
    #     print("New state", next_state)
    #     current_state = copy.deepcopy(next_state)
    #     puddy.visualize_agent(current_state)
    #     curr_char = raw_input("")

    # Learned TAMER policy without an explanation, from a fixed start state.
    explanation_features = [0, 0, 0]
    # current_state = puddy.get_initial_state()
    current_state = puddy.return_state(0, 0.2)
    print("Best action from tamer",
          all_actions[get_best_action(current_state, actions_models,
                                      all_actions, explanation_features)])
    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

    # Learned TAMER policy with the explanation feature set, from the
    # initial state.
    explanation_features = [1]
    current_state = puddy.get_initial_state()
    print("Best action from tamer",
          all_actions[get_best_action(current_state, actions_models,
                                      all_actions, explanation_features)])
    puddy.visualize_agent(current_state)
    curr_char = raw_input("")
    while curr_char.lower() != 'n':
        a = all_actions[get_best_action(current_state, actions_models,
                                        all_actions, explanation_features)]
        print("Next action", a)
        next_state = puddy.get_next_state(current_state, a)
        print("New state", next_state)
        current_state = copy.deepcopy(next_state)
        puddy.visualize_agent(current_state)
        curr_char = raw_input("")

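# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original code): the real
# load_trained_actions_models() is defined elsewhere in this repo and may
# differ.  It is assumed to rebuild the same per-action network architecture
# that was used for training and restore the saved HDF5 weights per action.
def example_load_trained_actions_models(all_actions,
                                        weights_file_str='weights-test/weights_{}.hdf5'):
    from keras.models import Sequential
    from keras.layers import Dense
    actions_models = {}
    for act in all_actions:
        model = Sequential()
        model.add(Dense(16, input_dim=5, activation='relu'))
        model.add(Dense(1, activation='linear'))
        model.compile(loss='mse', optimizer='adam')
        # Restore the weights saved during training, e.g. weights_<action>.hdf5.
        model.load_weights(weights_file_str.format(act))
        actions_models['nn_model_{}'.format(act)] = model
    return actions_models
# --------------------------------------------------------------------------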