check_dataset(dataset, state_dim, action_dim, reward_dim)  # this is just a check, it can be removed in experiments
print('Dataset has %d samples' % dataset.shape[0])

# reward_idx = state_dim + action_dim
# sast = np.append(dataset[:, :reward_idx],
#                  dataset[:, reward_idx + reward_dim:-1],
#                  axis=1)
# r = dataset[:, reward_idx]
sast, r = split_data_for_fqi(dataset, state_dim, action_dim, reward_dim)

fqi_iterations = mdp.horizon  # this is usually less than the horizon
fqi = FQI(estimator=regressor,
          state_dim=state_dim,
          action_dim=action_dim,
          discrete_actions=discrete_actions,
          gamma=mdp.gamma,
          horizon=fqi_iterations,
          verbose=True)

fit_params = {}
# fit_params = {
#     "n_epochs": 300,
#     "batch_size": 50,
#     "validation_split": 0.1,
#     "verbosity": False,
#     "criterion": "mse"
# }

fqi.partial_fit(sast, r, **fit_params)
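# For reference, a minimal sketch of what split_data_for_fqi is assumed to do,
# reconstructed from the commented-out code above: split the flat dataset into
# the (state, action, next state, absorbing) matrix expected by FQI and the
# reward vector. The helper's exact name and signature in the repository may
# differ; the optional n_samples argument mirrors the extra argument passed in
# the evaluation script further below.
import numpy as np

def split_data_for_fqi_sketch(dataset, state_dim, action_dim, reward_dim,
                              n_samples=None):
    data = dataset if n_samples is None else dataset[:n_samples]
    reward_idx = state_dim + action_dim
    # Drop the reward column(s) and the trailing episode-end flag
    sast = np.append(data[:, :reward_idx],
                     data[:, reward_idx + reward_dim:-1],
                     axis=1)
    r = data[:, reward_idx]
    return sast, r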
regressor = Regressor(regressor_class=ExtraTreesRegressor,
                      **fqi_regressor_params)
regressor = ActionRegressor(regressor,
                            discrete_actions=selected_actions_values,
                            tol=0.5,
                            **fqi_regressor_params)

# Create FQI model
fqi_params = {'estimator': regressor,
              'state_dim': selected_states_dim,
              'action_dim': selected_actions_dim,
              'discrete_actions': selected_actions_values,
              'gamma': mdp.gamma,
              'horizon': args.iterations,
              'verbose': True}
fqi = FQI(**fqi_params)

# Run FQI
print('Running FQI...')
print('Evaluating policy using model at %s' % args.path)
fqi_time = time.time()  # Save this for logging
average_episode_duration = len(dataset) / np.sum(dataset[:, -1])
iteration_values = []  # Stores performance of the policy at each step
fqi_fit_params = {}  # Optional parameters for fitting FQI
fqi_evaluation_params = {'metric': 'cumulative',
                         'n_episodes': 1,
                         'selected_states': selected_states,
                         'max_ep_len': 2 * average_episode_duration}

# Fit FQI
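# A possible shape for the fit/evaluate loop that the "# Fit FQI" comment above
# introduces (sketch only): it assumes sast and r were built from the dataset,
# e.g. with split_data_for_fqi as in the earlier snippet, and that
# evaluate_policy is a placeholder name for whatever evaluation routine the
# script uses together with fqi_evaluation_params.
fqi.partial_fit(sast, r, **fqi_fit_params)
for i in range(args.iterations - 1):
    # Subsequent calls can reuse the dataset stored by the first partial_fit
    fqi.partial_fit(None, None, **fqi_fit_params)
    iteration_values.append(evaluate_policy(mdp, fqi, **fqi_evaluation_params))
print('FQI took %f seconds' % (time.time() - fqi_time))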
# Run
for e in range(config['experiment_setting']['evaluation']['n_experiments']):
    print('Experiment: %d' % (e + 1))
    experiment_results = list()

    # Load dataset
    dataset = evaluation.collect_episodes(
        mdp,
        n_episodes=np.sort(config['experiment_setting']['evaluation']
                           ['n_episodes'])[-1])
    print('Dataset has %d samples' % dataset.shape[0])

    # Load FQI
    fqi = FQI(estimator=regressor,
              state_dim=state_dim,
              action_dim=action_dim,
              discrete_actions=discrete_actions,
              gamma=config['fqi']['gamma'],
              horizon=config['fqi']['horizon'],
              verbose=config['fqi']['verbose'])
    fit_params = config['fit_params']

    if config['experiment_setting']['evaluation']['metric'] == 'n_episodes':
        for i in config['experiment_setting']['evaluation']['n_episodes']:
            episode_end_idxs = np.argwhere(dataset[:, -1] == 1).ravel()
            last_el = episode_end_idxs[i - 1]
            sast, r = split_data_for_fqi(dataset, state_dim, action_dim,
                                         reward_dim, last_el + 1)
            fqi.fit(sast, r, **fit_params)
            experiment_results.append(evaluate(mdp, fqi, mdp.initial_states,
                                               args))
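# For clarity, an illustrative example of the configuration structure the loop
# above expects (keys taken from the code; the values here are placeholders
# only, the actual experiments load their own configuration file):
config = {
    'experiment_setting': {
        'evaluation': {
            'n_experiments': 5,           # number of independent repetitions
            'metric': 'n_episodes',       # evaluate as a function of dataset size
            'n_episodes': [10, 50, 100]   # dataset sizes (in episodes) to test
        }
    },
    'fqi': {
        'gamma': 0.99,
        'horizon': 100,
        'verbose': True
    },
    'fit_params': {}
}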
from random import random, choice

import joblib

from ifqi.algorithms.fqi.FQI import FQI  # import path may vary across ifqi versions


class EpsilonFQI:
    def __init__(self, fqi, fe, epsilon=0.05):
        """
        Creates an epsilon-greedy policy from the given FQI policy object.

        :param fqi: an FQI instance from the ifqi package, or a dict of
            parameters from which to build one
        :param fe: a feature extractor (method s_features(x) is expected)
        :param epsilon: exploration rate for the policy (0 <= epsilon <= 1)
        """
        self.epsilon = epsilon
        self.fqi = None
        self.actions = None
        self.fe = None

        self.load_fe(fe)
        if isinstance(fqi, dict):
            self.fqi = FQI(**fqi)
            self.actions = fqi['discrete_actions']
        else:
            self.load_fqi(fqi)

    def fit(self, sast, r, **kwargs):
        """
        Fits the policy on the given sast, r dataset, for the number of
        iterations defined when creating FQI.

        :param sast: a dataset of (state, action, next state, absorbing) transitions
        :param r: the rewards corresponding to the transitions in sast
        :param kwargs: additional arguments for the fit() function of FQI
        """
        self.fqi.fit(sast, r, **kwargs)

    def partial_fit(self, sast=None, r=None, **kwargs):
        """
        Fits the policy on the given sast, r dataset, for one iteration.

        :param sast: a dataset of (state, action, next state, absorbing) transitions
        :param r: the rewards corresponding to the transitions in sast
        :param kwargs: additional arguments for the partial_fit() function of FQI
        """
        self.fqi.partial_fit(sast, r, **kwargs)

    def draw_action(self, state, absorbing, evaluation=False,
                    fully_deterministic=False):
        """
        Picks an action according to the epsilon-greedy choice.

        :param state: a state
        :param absorbing: bool, whether the state is absorbing
        :param evaluation: bool, whether to use the epsilon defined for evaluation
        :param fully_deterministic: whether to use FQI, deterministically, to
            select the action
        :return: the selected action
        """
        if not fully_deterministic and random() <= self.epsilon:
            return choice(self.actions)
        else:
            preprocessed_state = self.fe.s_features(state)
            return self.fqi.draw_action(preprocessed_state, absorbing,
                                        evaluation=evaluation)

    def set_epsilon(self, epsilon):
        """
        :param epsilon: the exploration rate to use
        """
        self.epsilon = epsilon

    def get_epsilon(self):
        """
        :return: the current exploration rate
        """
        return self.epsilon

    def load_fqi(self, fqi):
        """
        Loads an FQI policy (from file or from an instance) and sets the
        action space accordingly.

        :param fqi: str or file-like object from which to load the policy,
            or an FQI instance
        """
        if isinstance(fqi, str):
            self.fqi = joblib.load(fqi)
        else:
            self.fqi = fqi
        # Set the correct action space
        self.actions = self.fqi._actions

    def save_fqi(self, filename):
        """
        Saves the FQI object to file.

        :param filename: filename to which to save the model
        """
        joblib.dump(self.fqi, filename)

    def load_fe(self, fe):
        """
        Loads the feature extractor (from file or from an instance).

        :param fe: str or file-like object from which to load the model,
            or a feature extractor instance
        """
        if isinstance(fe, str):
            self.fe = joblib.load(fe)
            self.fe.load(fe)
        else:
            self.fe = fe

    def save_fe(self, filename):
        """
        Saves the feature extractor to file.

        :param filename: filename to which to save the model
        """
        if hasattr(self.fe, 'save'):
            self.fe.save(filename)
        else:
            joblib.dump(self.fe, filename)
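# Illustrative usage of EpsilonFQI (not part of the original scripts). It assumes
# that regressor, state_dim, action_dim, discrete_actions, mdp, fqi_iterations,
# sast, r and a current state are defined as in the snippets above; the identity
# feature extractor below is a hypothetical stand-in for the learned extractor
# used in the experiments.
class IdentityExtractor:
    def s_features(self, x):
        return x  # no preprocessing: states are passed to FQI as-is

fqi_params = {'estimator': regressor,
              'state_dim': state_dim,
              'action_dim': action_dim,
              'discrete_actions': discrete_actions,
              'gamma': mdp.gamma,
              'horizon': fqi_iterations,
              'verbose': True}
policy = EpsilonFQI(fqi_params, IdentityExtractor(), epsilon=0.1)
policy.fit(sast, r)  # run all FQI iterations on the dataset
action = policy.draw_action(state, absorbing=False)  # epsilon-greedy action
policy.set_epsilon(0.01)  # e.g. lower exploration before evaluation
policy.save_fqi('fqi_policy.pkl')  # serialize the learned Q-function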