Exemple #1
0
    def __init__(self, fqi, fe, epsilon=0.05):
        """
        Build an epsilon-greedy policy around an FQI policy.

        :param fqi: an FQI instance from the ifqi package, or a dict of
            keyword arguments from which one is constructed (must contain
            the key 'discrete_actions')
        :param fe: a feature extractor (method s_features(x) is expected)
        :param epsilon: exploration rate for the policy (0 <= epsilon <= 1)
        """
        self.epsilon = epsilon
        # Placeholders; filled in by the loaders below.
        self.fqi = self.actions = self.fe = None

        self.load_fe(fe)

        # Accept either a ready-made FQI object or its constructor kwargs.
        if not isinstance(fqi, dict):
            self.load_fqi(fqi)
        else:
            self.fqi = FQI(**fqi)
            self.actions = fqi['discrete_actions']
Exemple #2
0
    # Wrap the base regressor in an ActionRegressor over the discrete action
    # set; presumably `tol` controls how action values are matched to the
    # discrete set -- verify against the ifqi ActionRegressor docs.
    regressor = ActionRegressor(regressor,
                                discrete_actions=selected_actions_values,
                                tol=0.5,
                                **fqi_regressor_params)

    # Create FQI model
    fqi_params = {
        'estimator': regressor,
        'state_dim': selected_states_dim,
        'action_dim': selected_actions_dim,
        'discrete_actions': selected_actions_values,
        'gamma': mdp.gamma,
        'horizon': args.iterations,  # number of FQI iterations to run
        'verbose': True
    }
    fqi = FQI(**fqi_params)

    # Run FQI
    print('Running FQI...')
    print('Evaluating policy using model at %s' % args.path)
    fqi_time = time.time()  # Save this for logging

    # dataset[:, -1] appears to be an episode-terminal flag (1 at episode
    # ends), so its sum counts episodes -- TODO confirm against the collector.
    average_episode_duration = len(dataset) / np.sum(dataset[:, -1])
    iteration_values = []  # Stores performance of the policy at each step
    fqi_fit_params = {}  # Optional parameters for fitting FQI
    fqi_evaluation_params = {
        'metric': 'cumulative',
        'n_episodes': 1,
        'selected_states': selected_states,
        # Cap evaluation episodes at twice the average length seen in data.
        'max_ep_len': 2 * average_episode_duration
    }
Exemple #3
0
# Validate the dataset column layout; this is just a sanity
# check, it can be removed in experiments
check_dataset(dataset, state_dim, action_dim, reward_dim)
print('Dataset has %d samples' % dataset.shape[0])

# Manual column splitting kept for reference; superseded by the helper below.
# reward_idx = state_dim + action_dim
# sast = np.append(dataset[:, :reward_idx],
#                  dataset[:, reward_idx + reward_dim:-1],
#                  axis=1)
# r = dataset[:, reward_idx]
# sast: (state, action, next-state) samples; r: rewards.
sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim)

fqi_iterations = mdp.horizon  # this is usually less than the horizon
fqi = FQI(estimator=regressor,
          state_dim=state_dim,
          action_dim=action_dim,
          discrete_actions=discrete_actions,
          gamma=mdp.gamma,
          horizon=fqi_iterations,
          verbose=True)

# No extra fitting options by default; an example configuration (e.g. for a
# neural-network regressor) is kept commented out for reference.
fit_params = {}
# fit_params = {
#     "n_epochs": 300,
#     "batch_size": 50,
#     "validation_split": 0.1,
#     "verbosity": False,
#     "criterion": "mse"
# }

# Presumably runs a single FQI iteration on the full dataset (vs. fit(),
# which runs all of them) -- confirm against ifqi's FQI.partial_fit.
fqi.partial_fit(sast, r, **fit_params)
Exemple #4
0
# Run
# Repeat the full train/evaluate cycle n_experiments times. NOTE(review):
# the loop body continues past this excerpt; results are presumably saved
# further down.
for e in range(config['experiment_setting']['evaluation']['n_experiments']):
    print('Experiment: %d' % (e + 1))
    experiment_results = list()

    # Load dataset
    # Collect enough episodes for the largest requested budget (n_episodes
    # may be unsorted, hence the sort followed by taking the last element).
    dataset = evaluation.collect_episodes(
        mdp, n_episodes=np.sort(config['experiment_setting']['evaluation']
                                ['n_episodes'])[-1])
    print('Dataset has %d samples' % dataset.shape[0])

    # Load FQI
    fqi = FQI(estimator=regressor,
              state_dim=state_dim,
              action_dim=action_dim,
              discrete_actions=discrete_actions,
              gamma=config['fqi']['gamma'],
              horizon=config['fqi']['horizon'],
              verbose=config['fqi']['verbose'])
    fit_params = config['fit_params']

    if config['experiment_setting']['evaluation']['metric'] == 'n_episodes':
        # Learning-curve mode: refit on the prefix of the dataset covering
        # the first i episodes, then evaluate the resulting policy.
        for i in config['experiment_setting']['evaluation']['n_episodes']:
            # dataset[:, -1] == 1 appears to mark episode-terminal rows --
            # confirm against evaluation.collect_episodes.
            episode_end_idxs = np.argwhere(dataset[:, -1] == 1).ravel()
            # Index of the last sample belonging to the i-th episode.
            last_el = episode_end_idxs[i - 1]
            sast, r = split_data_for_fqi(dataset, state_dim, action_dim,
                                         reward_dim, last_el + 1)

            fqi.fit(sast, r, **fit_params)

            experiment_results.append(evaluate(mdp, fqi, mdp.initial_states, args))