Example No. 1
class EpsilonFQI:
    def __init__(self, fqi, fe, epsilon=0.05):
        """
        Creates an epsilon-greedy policy from the given FQI policy object. 
        :param fqi: an FQI instance from the ifqi package, a dict of FQI
        constructor parameters, or a filename of a saved FQI policy
        :param fe: a feature extractor exposing s_features(x), or a filename
        from which to load one
        :param epsilon: exploration rate for the policy (0 <= epsilon <= 1)
        """
        self.epsilon = epsilon
        self.fqi = None
        self.actions = None
        self.fe = None

        self.load_fe(fe)

        if isinstance(fqi, dict):
            self.fqi = FQI(**fqi)
            self.actions = fqi['discrete_actions']
        else:
            self.load_fqi(fqi)
Example No. 2
# Sanity check on the dataset; it can be removed in experiments
check_dataset(dataset, state_dim, action_dim, reward_dim)
print('Dataset has %d samples' % dataset.shape[0])

# reward_idx = state_dim + action_dim
# sast = np.append(dataset[:, :reward_idx],
#                  dataset[:, reward_idx + reward_dim:-1],
#                  axis=1)
# r = dataset[:, reward_idx]
sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim)

fqi_iterations = mdp.horizon  # usually at most the MDP horizon; here set equal to it
fqi = FQI(estimator=regressor,
          state_dim=state_dim,
          action_dim=action_dim,
          discrete_actions=discrete_actions,
          gamma=mdp.gamma,
          horizon=fqi_iterations,
          verbose=True)

fit_params = {}
# fit_params = {
#     "n_epochs": 300,
#     "batch_size": 50,
#     "validation_split": 0.1,
#     "verbosity": False,
#     "criterion": "mse"
# }

fqi.partial_fit(sast, r, **fit_params)
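
The call above runs a single FQI iteration. A minimal sketch of how the remaining iterations could be driven, assuming (as the default arguments of EpsilonFQI.partial_fit shown later on this page suggest) that subsequent partial_fit calls may be made without passing new data:

# Sketch only: continue FQI for the remaining iterations, assuming that
# partial_fit(None, None) reuses the dataset passed on the first call
for _ in range(fqi_iterations - 1):
    fqi.partial_fit(None, None, **fit_params)
    # a per-iteration policy evaluation step could be inserted here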
Example No. 3
    regressor = ActionRegressor(regressor,
                                discrete_actions=selected_actions_values,
                                tol=0.5,
                                **fqi_regressor_params)

    # Create FQI model
    fqi_params = {
        'estimator': regressor,
        'state_dim': selected_states_dim,
        'action_dim': selected_actions_dim,
        'discrete_actions': selected_actions_values,
        'gamma': mdp.gamma,
        'horizon': args.iterations,
        'verbose': True
    }
    fqi = FQI(**fqi_params)

    # Run FQI
    print('Running FQI...')
    print('Evaluating policy using model at %s' % args.path)
    fqi_time = time.time()  # Save this for logging

    average_episode_duration = len(dataset) / np.sum(dataset[:, -1])  # last column flags episode ends
    iteration_values = []  # Stores performance of the policy at each step
    fqi_fit_params = {}  # Optional parameters for fitting FQI
    fqi_evaluation_params = {
        'metric': 'cumulative',
        'n_episodes': 1,
        'selected_states': selected_states,
        'max_ep_len': 2 * average_episode_duration
    }
Example No. 4
    regressor = Regressor(regressor_class=ExtraTreesRegressor,
                          **fqi_regressor_params)
    regressor = ActionRegressor(regressor,
                                discrete_actions=selected_actions_values,
                                tol=0.5,
                                **fqi_regressor_params)

    # Create FQI model
    fqi_params = {'estimator': regressor,
                  'state_dim': selected_states_dim,
                  'action_dim': selected_actions_dim,
                  'discrete_actions': selected_actions_values,
                  'gamma': mdp.gamma,
                  'horizon': args.iterations,
                  'verbose': True}
    fqi = FQI(**fqi_params)

    # Run FQI
    print('Running FQI...')
    print('Evaluating policy using model at %s' % args.path)
    fqi_time = time.time()  # Save this for logging

    average_episode_duration = len(dataset) / np.sum(dataset[:, -1])  # last column flags episode ends
    iteration_values = []  # Stores performance of the policy at each step
    fqi_fit_params = {}  # Optional parameters for fitting FQI
    fqi_evaluation_params = {'metric': 'cumulative',
                             'n_episodes': 1,
                             'selected_states': selected_states,
                             'max_ep_len': 2 * average_episode_duration}

    # Fit FQI
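
The snippet stops right before the fit loop. A hedged sketch of what that loop might look like with the variables prepared above; evaluate_policy is a hypothetical stand-in for whatever evaluation routine consumes fqi_evaluation_params, and sast, r are assumed to come from split_data_for_fqi as in the other examples:

    # Sketch only: run FQI one iteration at a time and record the policy's
    # performance after each step
    fqi.partial_fit(sast, r, **fqi_fit_params)
    for _ in range(args.iterations - 1):
        fqi.partial_fit(None, None, **fqi_fit_params)
        # iteration_values.append(
        #     evaluate_policy(mdp, fqi, **fqi_evaluation_params))  # hypothetical helper
    print('FQI took %.2f seconds' % (time.time() - fqi_time))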
Example No. 5
# Run
for e in range(config['experiment_setting']['evaluation']['n_experiments']):
    print('Experiment: %d' % (e + 1))
    experiment_results = list()

    # Load dataset
    dataset = evaluation.collect_episodes(
        mdp, n_episodes=np.sort(config['experiment_setting']['evaluation']
                                ['n_episodes'])[-1])
    print('Dataset has %d samples' % dataset.shape[0])

    # Load FQI
    fqi = FQI(estimator=regressor,
              state_dim=state_dim,
              action_dim=action_dim,
              discrete_actions=discrete_actions,
              gamma=config['fqi']['gamma'],
              horizon=config['fqi']['horizon'],
              verbose=config['fqi']['verbose'])
    fit_params = config['fit_params']

    if config['experiment_setting']['evaluation']['metric'] == 'n_episodes':
        for i in config['experiment_setting']['evaluation']['n_episodes']:
            episode_end_idxs = np.argwhere(dataset[:, -1] == 1).ravel()
            last_el = episode_end_idxs[i - 1]
            sast, r = split_data_for_fqi(dataset, state_dim, action_dim,
                                         reward_dim, last_el + 1)

            fqi.fit(sast, r, **fit_params)

            experiment_results.append(evaluate(mdp, fqi, mdp.initial_states, args))
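
For reference, the configuration keys read by this loop can be collected into a single dict. The structure below is inferred from the accesses in the snippet itself; the values are purely illustrative.

# Illustrative config skeleton matching the keys accessed above (values are examples)
config = {
    'experiment_setting': {
        'evaluation': {
            'n_experiments': 5,           # outer loop over repeated experiments
            'metric': 'n_episodes',       # selects the branch shown above
            'n_episodes': [10, 50, 100]   # dataset sizes to fit and evaluate on
        }
    },
    'fqi': {'gamma': 0.99, 'horizon': 100, 'verbose': True},
    'fit_params': {}
}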
Example No. 6
class EpsilonFQI:
    def __init__(self, fqi, fe, epsilon=0.05):
        """
        Creates an epsilon-greedy policy from the given FQI policy object. 
        :param fqi: an FQI instance from the ifqi package, a dict of FQI
        constructor parameters, or a filename of a saved FQI policy
        :param fe: a feature extractor exposing s_features(x), or a filename
        from which to load one
        :param epsilon: exploration rate for the policy (0 <= epsilon <= 1)
        """
        self.epsilon = epsilon
        self.fqi = None
        self.actions = None
        self.fe = None

        self.load_fe(fe)

        if isinstance(fqi, dict):
            self.fqi = FQI(**fqi)
            self.actions = fqi['discrete_actions']
        else:
            self.load_fqi(fqi)

    def fit(self, sast, r, **kwargs):
        """
        Fits the policy on the given sast, r dataset, for the number of
        iterations defined when the FQI object was created.
        :param sast: a dataset of (state, action, state, terminal) transitions
        :param r: the rewards corresponding to the transitions in sast
        :param kwargs: additional arguments for the fit() function of FQI
        """
        self.fqi.fit(sast, r, **kwargs)

    def partial_fit(self, sast=None, r=None, **kwargs):
        """
        Fits the policy on the given sast, r dataset, for one iteration.
        :param sast: a dataset of (state, action, state, terminal) transitions
        :param r: the rewards corresponding to the transitions in sast
        :param kwargs: additional arguments for the partial_fit() function of FQI
        """
        self.fqi.partial_fit(sast, r, **kwargs)

    def draw_action(self,
                    state,
                    absorbing,
                    evaluation=False,
                    fully_deterministic=False):
        """
        Picks an action according to the epsilon-greedy policy.
        :param state: a state
        :param absorbing: bool, whether the state is absorbing
        :param evaluation: bool, forwarded to FQI's draw_action
        :param fully_deterministic: bool, whether to skip the epsilon-greedy
        draw and always let FQI select the action
        :return: the selected action
        """
        if not fully_deterministic and random() <= self.epsilon:
            return choice(self.actions)
        else:
            preprocessed_state = self.fe.s_features(state)
            return self.fqi.draw_action(preprocessed_state,
                                        absorbing,
                                        evaluation=evaluation)

    def set_epsilon(self, epsilon):
        """
        :param epsilon: the exploration rate to use 
        """
        self.epsilon = epsilon

    def get_epsilon(self):
        """
        :return: the current exploration rate 
        """
        return self.epsilon

    def load_fqi(self, fqi):
        """
        Loads an FQI policy and sets the action space accordingly.
        :param fqi: filename from which to load the policy, or an FQI instance
        to use directly
        """
        if isinstance(fqi, str):
            self.fqi = joblib.load(fqi)
        else:
            self.fqi = fqi
        # Set the correct action space
        self.actions = self.fqi._actions

    def save_fqi(self, filename):
        """
        Saves the FQI object to file
        :param filename: filename to which to save the model
        """
        joblib.dump(self.fqi, filename)

    def load_fe(self, fe):
        """
        Loads the feature extractor.
        :param fe: filename from which to load the extractor, or a feature
        extractor instance to use directly
        """
        if isinstance(fe, str):
            self.fe = joblib.load(fe)
            self.fe.load(fe)
        else:
            self.fe = fe

    def save_fe(self, filename):
        """
        Saves the feature extractor to file
        :param filename: filename to which to save the model
        """
        if hasattr(self.fe, 'save'):
            self.fe.save(filename)
        else:
            joblib.dump(self.fe, filename)
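
A minimal usage sketch of EpsilonFQI, assuming an fqi_params dict (including 'discrete_actions') like the ones built in the earlier examples, a feature extractor exposing s_features, and a sast, r dataset; all names other than the EpsilonFQI API are placeholders:

# Hypothetical usage: fqi_params, feature_extractor, sast, r and state are placeholders
policy = EpsilonFQI(fqi_params, feature_extractor, epsilon=0.1)
policy.fit(sast, r)                      # run all FQI iterations at once
policy.set_epsilon(0.05)                 # lower exploration before evaluation
action = policy.draw_action(state, absorbing=False, evaluation=True)
policy.save_fqi('fqi_policy.pkl')        # persist the fitted FQI object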