def performer(solver, args, render=False):
    my_simulator = SIMULATOR()
    state = my_simulator.reset()
    episode_reward = 0
    if render:
        my_simulator.render()
    for i in range(MAX_EPISODE_LENGTH):
        action, _ = solver.search(state, args)
        state, reward, terminal = my_simulator.step(action)
        if render:
            print(SIMULATOR.ACTIONS[action], reward)
            my_simulator.render()
        episode_reward += reward
        if terminal:
            break
    return episode_reward
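# A minimal usage sketch for performer(). The solver object and its
# .search(state, args) interface returning an (action, value) pair are
# inferred from the call above; the evaluate() helper and its default
# episode count are illustrative assumptions, not part of the original code.
def evaluate(solver, args, n_episodes=10):
    # Average episode reward over several independent runs.
    rewards = [performer(solver, args, render=False) for _ in range(n_episodes)]
    return sum(rewards) / len(rewards)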
import time

import numpy as np


def run_exper(model, steps, get_features, pre_proc_features):
    from environment import SIMULATOR

    # initializing our environment
    my_sim = SIMULATOR()

    # beginning of an episode
    state_temp = my_sim.reset()
    observation = my_sim.state_to_tensor(state_temp)
    r_tup, e_tup, rover_poss = [], [], []

    # main loop
    prev_input = None
    total_moves = 0
    MAX_MOVES = 25
    for i in range(steps):
        total_moves += 1
        start = time.perf_counter()
        cur_input = observation

        # Flatten the observation; before the first step there is no previous
        # frame, so start from a zero vector of the same length.
        x = cur_input.astype(float).ravel() if prev_input is not None else np.zeros(70)
        x = x[10:80] if prev_input is not None else x
        # Drop indices 0, 10, 20, ... and then indices 8, 17, 26, ... of the
        # remainder, trimming the 70-value slice to 56 entries.
        x = np.array([x[i] for i in range(len(x)) if i % 10 != 0])
        x = np.array([x[i] for i in range(len(x)) if (i - 8) % 9 != 0])
        x, rov_pos = get_rover_pos(x, r_tup, e_tup, rover_poss)
        x = np.array(x)
        rover_poss.append(rov_pos)

        # Disabled experimental preprocessing, kept for reference:
        # x = x[x != 0]
        # if len(x) == 1:
        #     x = np.zeros(4)
        # x = x.tolist()
        # x.append(-7.)
        # x = np.array(x)

        # print_map(x)
        x_t = pre_proc_features.fit_transform(x.reshape(-1, 1))
        x_t = x_t.reshape(1, INPUT_SIZE)[0]
        print("Shape = ", x_t.shape)
        prev_input = cur_input

        # forward the policy network and pick the most probable action
        # (greedy argmax rather than sampling)
        proba = model.predict(np.expand_dims(x_t, axis=1).T)
        end = time.perf_counter()
        action = proba[0].argmax()
        print("Time taken = ", end - start)

        # run one step
        state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
        observation = my_sim.state_to_tensor(state_temp)
        my_sim.render()
        time.sleep(1)

        # cap episode length at MAX_MOVES steps
        if total_moves == MAX_MOVES:
            total_moves = 0
            done = True

        # if episode is over, reset to beginning
        if done:
            state_temp = my_sim.reset()
            observation = my_sim.state_to_tensor(state_temp)
            my_sim.render()
            rover_poss = []
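# A minimal sketch of how run_exper() might be invoked. The StandardScaler
# standing in for pre_proc_features (anything exposing fit_transform fits the
# call above), the Keras-style model loaded from "rover_policy.h5", and
# passing None for the get_features argument (unused in the body shown) are
# all illustrative assumptions.
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import load_model

policy = load_model("rover_policy.h5")  # hypothetical saved policy network
run_exper(policy, steps=100, get_features=None, pre_proc_features=StandardScaler())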
# Inside the training loop: choose an action from either the learned policy
# (mprob) or the random baseline (rprob), depending on `select`.
action = mprob.argmax() if select == MODEL else rprob.argmax()
y = mprob if select == MODEL else rprob

# categorical action labels (one-hot over the four actions)
y_data = [0.0] * 4
y_data[action] = 1.0
x_train.append(x_t)
y_train.append(y_data)
y_pred.append(y[action])
print("Action = {0}; model = {1}".format(
    action, "DL" if select == MODEL else "RND"))

# do one step in our environment
state_temp, reward, done, r_tup, e_tup = my_sim.step(action)
observation = my_sim.state_to_tensor(state_temp)
my_sim.render()
rewards.append(float(reward))
reward_sum += float(reward)

# abort the episode once the accumulated reward falls below the threshold
if reward_sum < MAX_NEG_REWARD:
    done = True

# end of an episode
if done:
    if reward_sum > 0:
        print('At the end of episode {0} the total reward was: {1}'.format(
            episode_nb, reward_sum))
    # increment episode number
    episode_nb += 1
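# A minimal sketch of how the buffers collected above could drive a
# supervised update at the episode boundary. Treating `model` as a Keras
# network fit on the one-hot action labels, and the epoch/batch settings,
# are assumptions for illustration, not part of the original training code.
import numpy as np

def update_policy(model, x_train, y_train):
    # Fit the policy on the states visited and the one-hot actions taken,
    # then let the caller clear the buffers for the next episode.
    model.fit(np.vstack(x_train), np.array(y_train),
              epochs=1, batch_size=32, verbose=0)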