Example 1
def performer(solver, args, render=False):
    # Run one episode with the given solver and return the total reward
    # (MAX_EPISODE_LENGTH is expected to be defined at module level).
    from environment import SIMULATOR

    my_simulator = SIMULATOR()
    state = my_simulator.reset()
    episode_reward = 0

    if render:
        my_simulator.render()

    for i in range(MAX_EPISODE_LENGTH):
        action, _ = solver.search(state, args)

        state, reward, terminal = my_simulator.step(action)

        if render:
            print(SIMULATOR.ACTIONS[action], reward)
            my_simulator.render()

        episode_reward += reward
        if terminal:
            break

    return episode_reward
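The solver passed to performer only needs a search(state, args) method that returns an action (plus a second value the episode loop ignores). As a minimal usage sketch, assuming the same environment module is available, a placeholder random solver could drive one episode as follows; RandomSolver and the MAX_EPISODE_LENGTH value are illustrative assumptions, not part of the original code:

import random

from environment import SIMULATOR

MAX_EPISODE_LENGTH = 100  # assumed value for the module-level constant used by performer()


class RandomSolver:
    # Hypothetical stand-in that satisfies the search(state, args) interface
    # performer() expects; it simply picks a random action each step.
    def search(self, state, args):
        action = random.randrange(len(SIMULATOR.ACTIONS))
        return action, None


episode_reward = performer(RandomSolver(), args=None, render=True)
print("Episode reward:", episode_reward)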
def run_exper(model, steps, get_features, pre_proc_features):
    import time

    import numpy as np
    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    r_tup, e_tup, rover_poss = [], [], []
    # main loop
    prev_input = None
    total_moves = 0
    MAX_MOVES = 25
    for i in range(steps):
        total_moves += 1
        start = time.perf_counter()
        cur_input = observation
        # Flatten the observation to a 1-D float vector (zeros on the very first step)
        x = cur_input.astype(float).ravel() if prev_input is not None else np.zeros(70)
        x = x[10:80] if prev_input is not None else x
        # Drop every 10th element, then every element at offset 8 of each remaining group of 9
        x = x[np.arange(x.size) % 10 != 0]
        x = x[(np.arange(x.size) - 8) % 9 != 0]

        x, rov_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        x = np.array(x)
        rover_poss.append(rov_pos)
        """
        x = x[x != 0]
        if(len(x) == 1):
            x = np.zeros(4)
            x = x.tolist()
            x.append(-7.)
            x = np.array(x)
        """
        #print_map(x)
        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        print("Shape = ", x_t.shape)
        prev_input = cur_input

        # forward pass through the policy network; the action is the argmax of its output probabilities
        #print_map(x)
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        end = time.perf_counter()
        action = proba[0].argmax()
        print("Time taken = ", end - start)

        #run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        my_sim.render()
        time.sleep(1)

        if total_moves == MAX_MOVES:
            total_moves = 0
            done = True
        # if episode is over, reset to beginning
        if done:
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            my_sim.render()
            rover_poss = []
    # Choose the action from either the model's output (mprob) or the random
    # policy's output (rprob), depending on the selected source.
    action = mprob.argmax() if select == MODEL else rprob.argmax()
    y = mprob if select == MODEL else rprob
    # one-hot (categorical) label for the chosen action
    y_data = [0.0] * 4
    y_data[action] = 1.0

    x_train.append(x_t)
    y_train.append(y_data)
    y_pred.append(y[action])
    print("Action = {0}; model = {1}; ".format(
        action, "DL" if select == MODEL else "RND"))

    # do one step in our environment
    state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
    observation = my_sim.state_to_tensor(state_temp)
    my_sim.render()
    rewards.append(float(reward))
    reward_sum += float(reward)
    # end the episode early once the cumulative reward drops below the negative threshold
    if reward_sum < MAX_NEG_REWARD:
        done = True

    # end of an episode
    if done:
        if reward_sum > 0:
            print(
                'At the end of episode {0} the total reward was: {1}'.format(
                    episode_nb, reward_sum))

        # increment episode number
        episode_nb += 1
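For reference, pre_proc_features only needs a scikit-learn style fit_transform, and model.predict receives a single row of shape (1, INPUT_SIZE). Below is a minimal sketch of that shaping step; the MinMaxScaler choice and the INPUT_SIZE value are assumptions made for illustration, not values taken from the code above:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

INPUT_SIZE = 56  # illustrative assumption; must match the length of the feature vector

pre_proc_features = MinMaxScaler(feature_range=(-1, 1))  # assumed scaler type

x = np.random.rand(INPUT_SIZE)                            # stand-in feature vector
x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))   # min-max scale the values into [-1, 1]
x_t = x_t.reshape(1, INPUT_SIZE)[0]                       # back to a flat (INPUT_SIZE,) vector

model_input = np.expand_dims(x_t, axis=1).T               # shape (1, INPUT_SIZE), as passed to model.predict
print(model_input.shape)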