# Offline dynamics-model training entry point.
import gym
import numpy as np
import tensorflow as tf

# construct_model, format_samples_for_training and model_name are assumed to be
# imported from the repo's model-constructor utilities.


def main(args):
    # Seed numpy and TF for reproducibility.
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Build the offline environment and pull its dataset.
    env = gym.make(f'{args.env}-{args.quality}-v0')
    dataset = env.get_dataset()
    obs_dim = dataset['observations'].shape[1]
    act_dim = dataset['actions'].shape[1]

    # Construct the ensemble dynamics model.
    model = construct_model(obs_dim=obs_dim,
                            act_dim=act_dim,
                            hidden_dim=args.hidden_dim,
                            num_networks=args.num_networks,
                            num_elites=args.num_elites,
                            model_type=args.model_type,
                            separate_mean_var=args.separate_mean_var,
                            name=model_name(args))

    # Rewards arrive as a flat vector; training expects a column vector.
    dataset['rewards'] = np.expand_dims(dataset['rewards'], 1)
    train_inputs, train_outputs = format_samples_for_training(dataset)

    model.train(train_inputs, train_outputs,
                batch_size=args.batch_size,
                holdout_ratio=args.holdout_ratio,
                max_epochs=args.max_epochs,
                max_t=args.max_t)
    model.save(args.model_dir, 0)
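# ---------------------------------------------------------------------------
# Illustrative driver for main(); a minimal sketch only. The flag names mirror
# the attributes read from `args` above, but the defaults here are assumptions,
# not the repo's actual configuration.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='halfcheetah')        # assumed default
    parser.add_argument('--quality', default='medium')         # assumed default
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--hidden_dim', type=int, default=200)
    parser.add_argument('--num_networks', type=int, default=7)
    parser.add_argument('--num_elites', type=int, default=5)
    parser.add_argument('--model_type', default='mlp')
    parser.add_argument('--separate_mean_var', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--holdout_ratio', type=float, default=0.2)
    parser.add_argument('--max_epochs', type=int, default=None)
    parser.add_argument('--max_t', type=int, default=None)
    parser.add_argument('--model_dir', default='./models')
    main(parser.parse_args())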
# train_model_offline.py variant: the same model-learning loop, wired into the
# RLA experiment logger and D4RL's qlearning_dataset.
import os

import d4rl
import gym
import numpy as np
import tensorflow as tf

# tester and get_package_path are assumed to come from the project's RLA-based
# logging utilities.


def main(args):
    # Seed numpy and TF for reproducibility.
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Set up RLA experiment logging before any training starts.
    tester.configure(task_name='model_learn',
                     private_config_path=os.path.join(get_package_path(), 'rla_config.yaml'),
                     run_file='train_model_offline.py',
                     log_root=get_package_path())
    tester.log_files_gen()
    tester.print_args()

    # Build the offline environment and load its D4RL dataset.
    env = gym.make('{}-{}-v0'.format(args.env, args.quality))
    dataset = d4rl.qlearning_dataset(env)  # env.qlearning_dataset()
    obs_dim = dataset['observations'].shape[1]
    act_dim = dataset['actions'].shape[1]

    # Construct the ensemble dynamics model.
    model = construct_model(obs_dim=obs_dim,
                            act_dim=act_dim,
                            hidden_dim=args.hidden_dim,
                            num_networks=args.num_networks,
                            num_elites=args.num_elites,
                            model_type=args.model_type,
                            separate_mean_var=args.separate_mean_var,
                            name=model_name(args))

    # Rewards arrive as a flat vector; training expects a column vector.
    dataset['rewards'] = np.expand_dims(dataset['rewards'], 1)
    train_inputs, train_outputs = format_samples_for_training(dataset)

    model.train(train_inputs, train_outputs,
                batch_size=args.batch_size,
                holdout_ratio=args.holdout_ratio,
                max_epochs=args.max_epochs,
                max_t=args.max_t)
    model.save(args.model_dir, 0)
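# ---------------------------------------------------------------------------
# For context, a minimal sketch of what format_samples_for_training is assumed
# to produce: the dynamics model consumes (observation, action) pairs and is
# trained to predict the reward together with the change in observation. The
# function name and exact layout below are assumptions, not the repo's code.
import numpy as np

def format_samples_for_training_sketch(dataset):
    obs = dataset['observations']
    act = dataset['actions']
    delta_obs = dataset['next_observations'] - obs        # predict the state change
    rew = dataset['rewards']                               # shaped (N, 1) by main()
    inputs = np.concatenate((obs, act), axis=-1)           # model inputs:  [s, a]
    outputs = np.concatenate((rew, delta_obs), axis=-1)    # model targets: [r, s' - s]
    return inputs, outputs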
def __init__(
        self,
        training_environment,
        evaluation_environment,
        policy,
        Qs,
        pool,
        static_fns,
        plotter=None,
        tf_summaries=False,
        lr=3e-4,
        reward_scale=1.0,
        target_entropy='auto',
        discount=0.99,
        tau=5e-3,
        target_update_interval=1,
        action_prior='uniform',
        reparameterize=False,
        store_extra_policy_info=False,
        adapt=False,
        gru_state_dim=256,
        network_kwargs=None,
        deterministic=False,
        rollout_random=False,
        model_train_freq=250,
        num_networks=7,
        num_elites=5,
        model_retain_epochs=20,
        rollout_batch_size=100e3,
        real_ratio=0.1,
        # rollout_schedule=[20, 100, 1, 1],
        rollout_length=1,
        hidden_dim=200,
        max_model_t=None,
        model_type='mlp',
        separate_mean_var=False,
        identity_terminal=0,
        pool_load_path='',
        pool_load_max_size=0,
        model_name=None,
        model_load_dir=None,
        penalty_coeff=0.,
        penalty_learned_var=False,
        **kwargs):
    """
    Args:
        env (`SoftlearningEnv`): Environment used for training.
        policy: A policy function approximator.
        initial_exploration_policy ('Policy'): A policy used for initial
            exploration and not trained by the algorithm.
        Qs: Q-function approximators. The minimum of these approximators is
            used; using at least two Q-functions reduces overestimation bias
            and improves performance.
        pool (`PoolBase`): Replay pool to add gathered samples to.
        plotter (`QFPolicyPlotter`): Plotter instance used to visualize the
            Q-function during training.
        lr (`float`): Learning rate used for the function approximators.
        discount (`float`): Discount factor for Q-function updates.
        tau (`float`): Soft value function target update weight.
        target_update_interval ('int'): Frequency, in iterations, at which
            target network updates occur.
        reparameterize ('bool'): If True, use a gradient estimator for the
            policy derived via the reparameterization trick; otherwise use a
            likelihood-ratio-based estimator.
""" super(MOPO, self).__init__(**kwargs) print("[ DEBUG ]: model name: {}".format(model_name)) if '_smv' in model_name: self._env_name = model_name[:-8] + '-v0' else: self._env_name = model_name[:-4] + '-v0' if self._env_name in infos.REF_MIN_SCORE: self.min_ret = infos.REF_MIN_SCORE[self._env_name] self.max_ret = infos.REF_MAX_SCORE[self._env_name] else: self.min_ret = self.max_ret = 0 obs_dim = np.prod(training_environment.active_observation_shape) act_dim = np.prod(training_environment.action_space.shape) self._model_type = model_type self._identity_terminal = identity_terminal self._model = construct_model(obs_dim=obs_dim, act_dim=act_dim, hidden_dim=hidden_dim, num_networks=num_networks, num_elites=num_elites, model_type=model_type, separate_mean_var=separate_mean_var, name=model_name, load_dir=model_load_dir, deterministic=deterministic) print('[ MOPO ]: got self._model') self._static_fns = static_fns self.fake_env = FakeEnv(self._model, self._static_fns, penalty_coeff=penalty_coeff, penalty_learned_var=penalty_learned_var) self._rollout_schedule = [20, 100, rollout_length, rollout_length] self._max_model_t = max_model_t self._model_retain_epochs = model_retain_epochs self._model_train_freq = model_train_freq self._rollout_batch_size = int(rollout_batch_size) self._deterministic = deterministic self._rollout_random = rollout_random self._real_ratio = real_ratio # TODO: RLA writer (implemented with tf) should be compatible with the Writer object (implemented with tbx) self._log_dir = tester.log_dir # self._writer = tester.writer self._writer = Writer(self._log_dir) self._training_environment = training_environment self._evaluation_environment = evaluation_environment self.gru_state_dim = gru_state_dim self.network_kwargs = network_kwargs self.adapt = adapt self.optim_alpha = False # self._policy = policy # self._Qs = Qs # self._Q_targets = tuple(tf.keras.models.clone_model(Q) for Q in Qs) self._pool = pool self._plotter = plotter self._tf_summaries = tf_summaries self._policy_lr = lr self._Q_lr = lr self._reward_scale = reward_scale self._target_entropy = ( -np.prod(self._training_environment.action_space.shape) if target_entropy == 'auto' else target_entropy) print('[ MOPO ] Target entropy: {}'.format(self._target_entropy)) self._discount = discount self._tau = tau self._target_update_interval = target_update_interval self._action_prior = action_prior self._reparameterize = reparameterize self._store_extra_policy_info = store_extra_policy_info observation_shape = self._training_environment.active_observation_shape action_shape = self._training_environment.action_space.shape assert len(observation_shape) == 1, observation_shape self._observation_shape = observation_shape assert len(action_shape) == 1, action_shape self._action_shape = action_shape self._build() #### load replay pool data self._pool_load_path = pool_load_path self._pool_load_max_size = pool_load_max_size loader.restore_pool(self._pool, self._pool_load_path, self._pool_load_max_size, save_path=self._log_dir) self._init_pool_size = self._pool.size print('[ MOPO ] Starting with pool size: {}'.format( self._init_pool_size))