def __init__(
        self,
        discount=0.99,
        batch_size=256,
        min_steps_learn=int(1e4),
        replay_size=int(1e6),
        training_ratio=256,  # data_consumption / data_generation
        target_update_tau=0.005,  # tau=1 for hard update.
        target_update_interval=1,  # interval=1000 for hard update.
        learning_rate=3e-4,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        action_prior="uniform",  # or "gaussian"
        reward_scale=1,
        reparameterize=True,
        clip_grad_norm=1e6,
        policy_output_regularization=0.001,
        n_step_return=1,
        ):
    if optim_kwargs is None:
        optim_kwargs = dict()
    assert action_prior in ["uniform", "gaussian"]
    save__init__args(locals())
    self.update_counter = 0
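# The target_update_tau / target_update_interval pair above controls the usual
# Polyak-averaged target-network update. A minimal sketch of that update, not
# this class's own method; the names `target_model` and `model` are illustrative:
def soft_update(target_model, model, tau=0.005):
    """Blend target parameters toward online parameters; tau=1 is a hard copy."""
    for tgt, src in zip(target_model.parameters(), model.parameters()):
        tgt.data.mul_(1 - tau)
        tgt.data.add_(tau * src.data)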
def __init__(
        self,
        batch_size,
        learning_rate,
        replay_filepath,
        delta_T=0,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_state_dict=None,
        clip_grad_norm=1000.,
        EncoderCls=EncoderModel,
        encoder_kwargs=None,
        latent_size=128,
        ReplayCls=UlForRlReplayBuffer,
        activation_loss_coefficient=0.0,
        learning_rate_anneal=None,  # cosine
        learning_rate_warmup=0,  # number of updates
        VaeHeadCls=VaeHeadModel,
        hidden_sizes=None,  # But maybe use for forward prediction
        DecoderCls=VaeDecoderModel,
        decoder_kwargs=None,
        kl_coeff=1.,
        onehot_action=True,
        validation_split=0.0,
        n_validation_batches=0,
        ):
    optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
    encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
    decoder_kwargs = dict() if decoder_kwargs is None else decoder_kwargs
    save__init__args(locals())
    self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
    assert learning_rate_anneal in [None, "cosine"]
    self._replay_T = delta_T + 1
def __init__(
        self,
        discount=0.99,
        batch_size=256,
        min_steps_learn=int(1e4),
        replay_size=int(1e6),
        replay_ratio=256,  # data_consumption / data_generation
        target_update_tau=0.005,  # tau=1 for hard update.
        target_update_interval=1,  # 1000 for hard update, 1 for soft.
        learning_rate=3e-4,
        fixed_alpha=None,  # None for adaptive alpha, float for any fixed value
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,  # for all of them.
        action_prior="uniform",  # or "gaussian"
        reward_scale=1,
        target_entropy="auto",  # "auto", float, or None
        reparameterize=True,
        clip_grad_norm=1e9,
        # policy_output_regularization=0.001,
        n_step_return=1,
        updates_per_sync=1,  # For async mode only.
        bootstrap_timelimit=False,
        ReplayBufferCls=None,  # Leave None to select by above options.
        ):
    """Save input arguments."""
    if optim_kwargs is None:
        optim_kwargs = dict()
    assert action_prior in ["uniform", "gaussian"]
    self._batch_size = batch_size
    del batch_size  # Property.
    save__init__args(locals())
def __init__(
        self,
        discount=0.99,
        learning_rate=0.001,
        value_loss_coeff=1.,
        entropy_loss_coeff=0.01,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        clip_grad_norm=1.,
        initial_optim_state_dict=None,
        gae_lambda=1,
        minibatches=4,
        epochs=4,
        ratio_clip=0.1,
        linear_lr_schedule=True,
        normalize_advantage=False,
        clip_vf_loss=False,  # Clip VF loss as in OpenAI?
        normalize_rewards=None,  # 'return' (OpenAI, no mean subtraction), 'reward' (same as obs normalization), or None
        rew_clip=(-10, 10),  # Additional clipping for reward (if normalizing reward)
        rew_min_var=1e-6,  # Minimum variance in running mean for reward (if normalizing reward)
        ):
    """Saves input settings."""
    if optim_kwargs is None:
        optim_kwargs = dict(eps=1e-5)
    save__init__args(locals())
def __init__(
        self,
        discount=0.99,
        learning_rate=0.001,
        vae_learning_rate=0.0001,
        value_loss_coeff=1.,
        entropy_loss_coeff=0.01,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        VaeOptimCls=torch.optim.Adam,
        vae_optim_kwargs=None,
        clip_grad_norm=1.,
        initial_optim_state_dict=None,
        gae_lambda=1,
        minibatches=4,
        epochs=4,
        ratio_clip=0.1,
        linear_lr_schedule=True,
        vae_linear_lr_schedule=False,
        normalize_advantage=False,
        normalize_rewards=True,
        vae_beta=1,
        vae_loss_coeff=0.1,
        vae_loss_type="l2",
        vae_update_freq=1,
        alternating_optim=False,
        ):
    """Saves input settings."""
    # Avoid mutable default arguments; fill in empty dicts here instead.
    if optim_kwargs is None:
        optim_kwargs = dict()
    if vae_optim_kwargs is None:
        vae_optim_kwargs = dict()
    save__init__args(locals())
def __init__(
        self,
        discount=0.99,
        learning_rate=1e-4,
        T_target_steps=100,
        bootstrap_with_online_model=False,
        OptimCls=torch.optim.Adam,
        pop_art_reward_normalization=True,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        minibatches=1,
        epochs=1,
        gae_lambda=0.97,
        discrete_actions=False,
        epsilon_eta=0.01,
        epsilon_alpha=0.01,
        initial_eta=1.0,
        initial_alpha=5.0,
        initial_alpha_mu=1.0,
        initial_alpha_sigma=1.0,
        epsilon_alpha_mu=0.0075,
        epsilon_alpha_sigma=1e-5,
        ):
    """Saves input settings."""
    if optim_kwargs is None:
        optim_kwargs = dict()
    self.pop_art_normalizer = PopArtLayer()
    save__init__args(locals())
def __init__(
        self,
        discount=0.99,
        learning_rate=0.001,
        value_loss_coeff=1.,
        entropy_loss_coeff=0.01,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        clip_grad_norm=1.,
        initial_optim_state_dict=None,
        gae_lambda=1,
        minibatches=4,
        epochs=4,
        ratio_clip=0.1,
        linear_lr_schedule=True,
        normalize_advantage=False,
        normalize_reward=False,
        kernel_params=None,
        curiosity_type='none',
        ):
    """Saves input settings."""
    if optim_kwargs is None:
        optim_kwargs = dict()
    save__init__args(locals())
    if self.normalize_reward:
        self.reward_ff = RewardForwardFilter(discount)
        self.reward_rms = RunningMeanStd()
    self.intrinsic_rewards = None
    if kernel_params is not None:
        self.mu, self.sigma = self.kernel_params
        self.kernel_line = lambda x: x
        self.kernel_gauss = lambda x: np.sign(x) * self.mu * np.exp(
            -(abs(x) - self.mu) ** 2 / (2 * self.sigma ** 2))
def __init__(
        self,
        discount=0.99,
        learning_rate=1e-3,  # Main learning rate
        termination_lr=5e-7,  # Termination learning rate
        pi_omega_lr=0.,  # Policy-over-options learning rate
        interest_lr=1e-3,  # Learning rate for interest function
        value_loss_coeff=0.5,
        termination_loss_coeff=1.,  # Coefficient for termination loss component
        entropy_loss_coeff=0.01,  # Entropy loss for low-level policy
        omega_entropy_loss_coeff=0.01,  # Entropy loss for policy over options
        delib_cost=0.,  # Cost for switching options; subtracted from rewards after normalization and added to termination advantage.
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        clip_grad_norm=1.,
        initial_optim_state_dict=None,
        gae_lambda=1,
        linear_lr_schedule=True,
        normalize_advantage=False,
        normalize_termination_advantage=False,  # Normalize termination advantage? Doesn't seem to be done.
        normalize_rewards=None,  # 'return' (OpenAI, no mean subtraction), 'reward' (same as obs normalization), or None
        rew_clip=(-10, 10),  # Additional clipping for reward (if normalizing reward)
        rew_min_var=1e-6,  # Minimum variance in running mean for reward (if normalizing reward)
        ):
    """Saves input settings."""
    if optim_kwargs is None:
        optim_kwargs = dict(eps=1e-5)
    save__init__args(locals())
def __init__(
        self,
        batch_size,
        learning_rate,
        replay_filepath,
        delta_T=1,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_state_dict=None,
        clip_grad_norm=10.,
        EncoderCls=EncoderModel,
        encoder_kwargs=None,
        ReplayCls=UlForRlReplayBuffer,
        onehot_actions=True,
        activation_loss_coefficient=0.0,
        learning_rate_anneal=None,  # cosine
        learning_rate_warmup=0,  # number of updates
        random_shift_prob=0.,
        random_shift_pad=4,
        InverseModelCls=InverseModel,
        inverse_model_kwargs=None,
        entropy_loss_coeff=0.01,
        validation_split=0.0,
        n_validation_batches=0,
        ):
    optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
    encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
    inverse_model_kwargs = (dict() if inverse_model_kwargs is None
        else inverse_model_kwargs)
    save__init__args(locals())
    self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
    assert learning_rate_anneal in [None, "cosine"]
    assert onehot_actions  # Needs discrete action space for now.
    assert delta_T > 0
    self._replay_T = delta_T + 1
def __init__(
        self,
        ModelCls=PiMlpModel,  # Pi model.
        QModelCls=QofMuMlpModel,
        model_kwargs=None,  # Pi model.
        q_model_kwargs=None,
        initial_model_state_dict=None,  # Pi model.
        action_squash=1,  # Max magnitude (or None).
        pretrain_std=0.75,  # High value to make near-uniform sampling.
        max_q_eval_mode='none',
        n_qs=2,
        ):
    self._max_q_eval_mode = max_q_eval_mode
    if isinstance(ModelCls, str):
        ModelCls = eval(ModelCls)
    if isinstance(QModelCls, str):
        QModelCls = eval(QModelCls)
    if model_kwargs is None:
        model_kwargs = dict(hidden_sizes=[256, 256])
    if q_model_kwargs is None:
        q_model_kwargs = dict(hidden_sizes=[256, 256])
    super().__init__(
        ModelCls=ModelCls,
        model_kwargs=model_kwargs,
        initial_model_state_dict=initial_model_state_dict,
    )  # For async setup.
    save__init__args(locals())
    self.min_itr_learn = 0  # Get from algo.
    self.log_alpha = None
    print('n_qs', self.n_qs)
    global Models
    Models = namedtuple("Models", ["pi"] + [f"q{i}" for i in range(self.n_qs)])
def __init__(
        self,
        discount=0.99,
        learning_rate=0.001,
        value_loss_coeff=1.,
        entropy_loss_coeff=0.01,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        VaeOptimCls=torch.optim.Adam,
        clip_grad_norm=1.,
        initial_optim_state_dict=None,
        gae_lambda=1,
        minibatches=4,
        epochs=4,
        ratio_clip=0.1,
        linear_lr_schedule=True,
        normalize_advantage=False,
        normalize_rewards=False,
        similarity_loss=False,
        similarity_coeff=0.1,
        ):
    """Saves input settings."""
    if optim_kwargs is None:
        optim_kwargs = dict()
    save__init__args(locals())
def __init__(
        self,
        replay_filepath,
        learning_rate,
        batch_B=64,
        batch_T=1,
        delta_T=1,
        use_global_global=False,
        use_global_local=True,
        use_local_local=True,
        local_conv_layer=1,  # 0-based indexing
        latent_size=256,
        target_update_tau=0.01,  # 1 for hard update
        target_update_interval=1,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_state_dict=None,
        clip_grad_norm=100.0,
        EncoderCls=StDimEncoderModel,
        encoder_kwargs=None,
        ReplayCls=UlForRlReplayBuffer,
        anchor_hidden_sizes=512,
        activation_loss_coefficient=0.0,
        learning_rate_anneal=None,  # cosine
        learning_rate_warmup=0,  # number of updates
        validation_split=0.0,
        n_validation_batches=0,
        ):
    optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
    encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
    save__init__args(locals())
    self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
    assert learning_rate_anneal in [None, "cosine"]
    self._replay_T = batch_T + delta_T
    self.batch_size = batch_B * batch_T  # for logging
def __init__(
        self,
        replay_filepath,
        ReplayCls=UlForRlReplayBuffer,
        delta_T=1,
        batch_T=1,
        batch_B=256,
        learning_rate=1e-3,
        learning_rate_anneal=None,  # cosine
        learning_rate_warmup=0,  # number of updates
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        clip_grad_norm=10.,
        target_update_tau=0.01,  # 1 for hard update
        target_update_interval=1,
        EncoderCls=EncoderModel,
        encoder_kwargs=None,
        latent_size=256,
        anchor_hidden_sizes=512,
        initial_state_dict=None,
        random_shift_prob=1.,
        random_shift_pad=4,
        activation_loss_coefficient=0.,  # rarely if ever used
        validation_split=0.0,
        n_validation_batches=0,  # usually don't do it
        ):
    optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
    encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
    save__init__args(locals())
    assert learning_rate_anneal in [None, "cosine"]
    self.batch_size = batch_B * batch_T  # for logging only
    self._replay_T = batch_T + delta_T
def __init__(self, ModelCls=None, model_kwargs=None, initial_model_state_dict=None):
    """
    Arguments are saved but no model initialization occurs.

    Args:
        ModelCls: The model class to be used.
        model_kwargs (optional): Any keyword arguments to pass when instantiating the model.
        initial_model_state_dict (optional): Initial model parameter values.
    """
    save__init__args(locals())
    self.model = None  # type: torch.nn.Module
    self.shared_model = None
    self.distribution = None
    self.device = torch.device("cpu")
    self._mode = None
    if self.model_kwargs is None:
        self.model_kwargs = dict()
    # The rest only for async operations:
    self._rw_lock = RWLock()
    self._send_count = mp.RawValue("l", 0)
    self._recv_count = 0
def __init__(
        self,
        batch_T,
        batch_B,
        learning_rate,
        replay_filepath,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_state_dict=None,
        clip_grad_norm=10.,
        EncoderCls=EncoderModel,
        encoder_kwargs=None,
        ReplayCls=UlForRlReplayBuffer,
        onehot_actions=True,
        activation_loss_coefficient=0.0,
        learning_rate_anneal=None,  # cosine
        learning_rate_warmup=0,  # number of updates
        PixCtlModelCls=PixelControlModel,
        pixel_control_model_kwargs=None,
        pixel_control_filename="pixel_control_80x80_4x4.pkl",  # Looks in replay path.
        validation_split=0.0,
        n_validation_batches=0,
        ):
    optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
    encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
    pixel_control_model_kwargs = (dict() if pixel_control_model_kwargs is None
        else pixel_control_model_kwargs)
    save__init__args(locals())
    assert learning_rate_anneal in [None, "cosine"]
    self._replay_T = batch_T
    self.batch_size = batch_T * batch_B  # for logging
def __init__(
        self,
        discount=0.99,
        batch_size=256,
        min_steps_learn=int(1e4),
        replay_size=int(1e6),
        replay_ratio=256,  # data_consumption / data_generation
        target_update_tau=0.005,  # tau=1 for hard update.
        target_update_interval=1,  # 1000 for hard update, 1 for soft.
        learning_rate=3e-4,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,  # for all of them.
        action_prior="uniform",  # or "gaussian"
        reward_scale=1,
        reparameterize=True,
        clip_grad_norm=1e9,
        policy_output_regularization=0.001,
        n_step_return=1,
        updates_per_sync=1,  # For async mode only.
        bootstrap_timelimit=True,
        ):
    if optim_kwargs is None:
        optim_kwargs = dict()
    assert action_prior in ["uniform", "gaussian"]
    self._batch_size = batch_size
    del batch_size  # Property.
    save__init__args(locals())
def __init__(
        self,
        ModelCls=PiMlpModel,  # Pi model.
        QModelCls=QofMuMlpModel,
        model_kwargs=None,  # Pi model.
        q_model_kwargs=None,
        initial_model_state_dict=None,  # Pi model.
        action_squash=1,  # Max magnitude (or None).
        pretrain_std=0.75,  # High value to make near-uniform sampling.
        ):
    if isinstance(ModelCls, str):
        ModelCls = eval(ModelCls)
    if isinstance(QModelCls, str):
        QModelCls = eval(QModelCls)
    if model_kwargs is None:
        model_kwargs = dict(hidden_sizes=[256, 256])
    if q_model_kwargs is None:
        q_model_kwargs = dict(hidden_sizes=[256, 256])
    super().__init__(
        ModelCls=ModelCls,
        model_kwargs=model_kwargs,
        initial_model_state_dict=initial_model_state_dict,
    )  # For async setup.
    save__init__args(locals())
    self.min_itr_learn = 0  # Get from algo.
    self.log_alpha = None
def __init__(
        self,
        discount=0.99,
        batch_size=64,
        min_steps_learn=int(1e3),
        replay_size=int(1e6),
        replay_ratio=64,  # data_consumption / data_generation
        target_update_tau=0.01,
        target_update_interval=1,
        policy_update_interval=1,
        learning_rate=1e-4,
        q_learning_rate=5e-5,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        clip_grad_norm=1e8,
        q_target_clip=1e6,
        n_step_return=1,
        updates_per_sync=1,  # For async mode only.
        bootstrap_timelimit=True,
        ReplayBufferCls=None,
        target=False,
        ):
    """Saves input arguments."""
    if optim_kwargs is None:
        optim_kwargs = dict()
    self._batch_size = batch_size
    del batch_size  # Property.
    save__init__args(locals())
def __init__(
        self,
        ModelCls=SacModel,
        ConvModelCls=SacConvModel,
        Fc1ModelCls=SacFc1Model,
        PiModelCls=SacActorModel,
        QModelCls=SacCriticModel,
        conv_kwargs=None,
        fc1_kwargs=None,
        pi_model_kwargs=None,
        q_model_kwargs=None,
        initial_state_dict=None,
        action_squash=1.0,
        pretrain_std=0.75,  # 0.75 gets pretty uniform squashed actions
        load_conv=False,
        load_all=False,
        state_dict_filename=None,
        store_latent=False,
        ):
    if conv_kwargs is None:
        conv_kwargs = dict()
    if fc1_kwargs is None:
        fc1_kwargs = dict(latent_size=50)  # default
    if pi_model_kwargs is None:
        pi_model_kwargs = dict(hidden_sizes=[1024, 1024])  # default
    if q_model_kwargs is None:
        q_model_kwargs = dict(hidden_sizes=[1024, 1024])  # default
    save__init__args(locals())
    super().__init__(ModelCls=SacModel)
    self.min_itr_learn = 0  # Get from algo.
    assert not (load_conv and load_all)
def __init__(
        self,
        alpha,
        beta,
        gamma,
        ):
    save__init__args(locals(), underscore=True)
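# save__init__args(locals()) is used throughout these constructors. Roughly, it
# stores each __init__ argument on the instance (optionally with a leading
# underscore), so the body does not have to write `self.x = x` for every
# parameter. A simplified sketch of that behavior; the real helper (from
# rlpyt-style utilities) also walks the class hierarchy and supports overwrite
# and subclass-only options:
def save__init__args_sketch(values, underscore=False):
    from inspect import getfullargspec
    self = values["self"]
    prefix = "_" if underscore else ""
    arg_names = getfullargspec(type(self).__init__).args[1:]  # skip `self`
    for name in arg_names:
        if name in values:
            setattr(self, prefix + name, values[name])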
def __init__(
        self,
        discount=0.99,
        batch_size=64,
        min_steps_learn=int(1e4),
        replay_size=int(1e6),
        replay_ratio=64,  # data_consumption / data_generation
        target_update_tau=0.01,
        target_update_interval=1,
        policy_update_interval=1,
        learning_rate=1e-4,
        q_learning_rate=1e-3,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        clip_grad_norm=1e8,
        q_target_clip=1e6,
        n_step_return=1,
        updates_per_sync=1,  # For async mode only.
        ):
    if optim_kwargs is None:
        optim_kwargs = dict()
    self._batch_size = batch_size
    del batch_size  # Property.
    save__init__args(locals())
def __init__(
        self,
        discount=0.99,
        batch_size=500,
        buffer_size=int(1e6),
        min_steps_learn=int(1e1),  # very efficient
        target_update_tau=0.9,
        target_update_interval=5,
        policy_update_interval=5,
        learning_rate=1e-2,
        d_learning_rate=1e-2,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        clip_grad_norm=1e8,
        d_target_clip=1e6,
        updates_per_sync=1,  # For async mode only.
        bootstrap_timelimit=True,
        obs_cost_fn=None,
        ):
    """Saves input arguments."""
    if optim_kwargs is None:
        optim_kwargs = dict()
    self._batch_size = batch_size
    del batch_size  # Property.
    save__init__args(locals())
def __init__(
        self,
        batch_B,
        batch_T,
        learning_rate,
        replay_filepath,
        warmup_T=0,
        rnn_size=256,
        latent_size=256,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_state_dict=None,
        clip_grad_norm=1000.,
        validation_split=0.0,
        n_validation_batches=0,
        EncoderCls=EncoderModel,
        encoder_kwargs=None,
        ReplayCls=UlForRlReplayBuffer,
        onehot_actions=True,
        activation_loss_coefficient=0.,  # 0 for OFF
        learning_rate_anneal=None,  # cosine
        learning_rate_warmup=0,  # number of updates
        ):
    optim_kwargs = dict() if optim_kwargs is None else optim_kwargs
    encoder_kwargs = dict() if encoder_kwargs is None else encoder_kwargs
    save__init__args(locals())
    self.c_e_loss = torch.nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
    assert learning_rate_anneal in [None, "cosine"]
    self.batch_size = batch_B * batch_T  # for logging only
    self._replay_T = batch_T + warmup_T
def __init__(
        self,
        ModelCls=PiMlpModel,  # Pi model.
        QModelCls=QofMuMlpModel,
        model_kwargs=None,  # Pi model.
        q_model_kwargs=None,
        v_model_kwargs=None,
        initial_model_state_dict=None,  # All models.
        action_squash=1.0,  # Max magnitude (or None).
        pretrain_std=0.75,  # With squash, 0.75 is near uniform.
        ):
    """Saves input arguments; network defaults stored within."""
    if model_kwargs is None:
        model_kwargs = dict(hidden_sizes=[256, 256])
    if q_model_kwargs is None:
        q_model_kwargs = dict(hidden_sizes=[256, 256])
    if v_model_kwargs is None:
        v_model_kwargs = dict(hidden_sizes=[256, 256])
    super().__init__(
        ModelCls=ModelCls,
        model_kwargs=model_kwargs,
        initial_model_state_dict=initial_model_state_dict,
    )
    save__init__args(locals())
    self.min_itr_learn = 0  # Get from algo.
def __init__(
        self,
        discount=0.99,
        batch_size=32,
        min_steps_learn=int(5e4),
        delta_clip=1.,
        replay_size=int(1e6),
        training_ratio=8,  # data_consumption / data_generation
        target_update_steps=int(1e4),  # Per env steps sampled.
        n_step_return=1,
        learning_rate=2.5e-4,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        clip_grad_norm=10.,
        eps_init=1,
        eps_final=0.01,
        eps_final_min=None,  # Set < eps_final to use vector-valued eps.
        eps_steps=int(1e6),
        eps_eval=0.001,
        double_dqn=False,
        prioritized_replay=False,
        pri_alpha=0.6,
        pri_beta_init=0.4,
        pri_beta_final=1.,
        pri_beta_steps=int(50e6),
        default_priority=None,
        ReplayBufferCls=None,  # Leave None to select by above options.
        ):
    if optim_kwargs is None:
        optim_kwargs = dict(eps=0.01 / batch_size)
    if default_priority is None:
        default_priority = delta_clip
    save__init__args(locals())
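# eps_init / eps_final / eps_steps above parameterize the usual linear
# epsilon-greedy exploration schedule (the agent applies it; this is only an
# illustrative sketch of that schedule, not this class's method):
def linear_eps(step, eps_init=1.0, eps_final=0.01, eps_steps=int(1e6)):
    """Linearly anneal epsilon from eps_init to eps_final over eps_steps."""
    frac = min(max(step / eps_steps, 0.0), 1.0)
    return eps_init + frac * (eps_final - eps_init)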
def __init__(
        self,
        discount=0.997,
        lambda_coef=1.0,  # lambda-return coefficient
        batch_T=12,  # replay trajectory length
        batch_B=64,
        warmup_T=0,  # originally 40
        store_rnn_state_interval=9,  # 0 for none, 1 for all; default was 40
        min_steps_learn=int(1e5),
        delta_clip=None,  # Typically use squared-error loss (Steven).
        replay_size=int(1e6),
        replay_ratio=1,
        target_update_interval=2500,  # (Steven says 2500, but maybe faster.)
        n_step_return=1,  # originally 5; minimum is 1
        learning_rate=1e-4,
        OptimCls=torch.optim.Adam,
        optim_kwargs=None,
        initial_optim_state_dict=None,
        clip_grad_norm=80.,  # 80 (Steven)
        eps_steps=int(1e6),  # STILL IN ALGO; convert to itr, give to agent.
        double_dqn=False,  # originally True
        prioritized_replay=True,
        pri_alpha=0.6,
        pri_beta_init=0.9,
        pri_beta_final=0.9,
        pri_beta_steps=int(50e6),
        pri_eta=0.9,
        default_priority=None,
        input_priorities=False,  # default True; not sure what it is used for
        input_priority_shift=None,
        value_scale_eps=1e-3,  # 1e-3 (Steven).
        ReplayBufferCls=None,  # Leave None to select by above options.
        updates_per_sync=1,  # For async mode only.
        ):
    """Saves input arguments; see the parameter comments above for details."""
    if optim_kwargs is None:
        optim_kwargs = dict(eps=1e-3)  # Assumes Adam.
    if default_priority is None:
        default_priority = delta_clip or 1.
    # if input_priority_shift is None:  # only used in prioritized replay and warmup, I think
    #     input_priority_shift = warmup_T // store_rnn_state_interval
    save__init__args(locals())
    self._batch_size = (self.batch_T + self.warmup_T) * self.batch_B
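# value_scale_eps above is the epsilon in the R2D2-style invertible value
# rescaling commonly applied to recurrent-DQN targets. A sketch of the standard
# transform and its inverse, assuming that conventional form (not necessarily
# this class's exact implementation):
import torch

def value_scale(x, eps=1e-3):
    """h(x) = sign(x) * (sqrt(|x| + 1) - 1) + eps * x"""
    return torch.sign(x) * (torch.sqrt(torch.abs(x) + 1.) - 1.) + eps * x

def inv_value_scale(z, eps=1e-3):
    """Inverse of value_scale, from the same standard formulation."""
    return torch.sign(z) * (((torch.sqrt(
        1. + 4. * eps * (torch.abs(z) + 1. + eps)) - 1.) / (2. * eps)) ** 2 - 1.)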
def __init__(
        self,
        envs,
        agent,
        TrajInfoCls,
        max_T,
        max_trajectories=None,
        ):
    save__init__args(locals())
def __init__(self, game="pong", frame_skip=4, # Frames per step (>=1). num_img_obs=4, # Number of (past) frames in observation (>=1) - "frame stacking". clip_reward=True, episodic_lives=True, fire_on_reset=False, max_start_noops=30, repeat_action_probability=0., horizon=27000, no_extrinsic=False, no_negative_reward=False, normalize_obs=False, normalize_obs_steps=10000, downsampling_scheme='classical', record_freq=0, record_dir=None ): save__init__args(locals(), underscore=True) # ALE game_path = atari_py.get_game_path(game) if not os.path.exists(game_path): raise IOError("You asked for game {} but path {} does not " " exist".format(game, game_path)) self.ale = atari_py.ALEInterface() self.ale.setFloat(b'repeat_action_probability', repeat_action_probability) self.ale.loadROM(game_path) # Spaces self._action_set = self.ale.getMinimalActionSet() self._action_space = IntBox(low=0, high=len(self._action_set)) if downsampling_scheme == 'classical': self._frame_shape = (84, 84) # (W, H) elif downsampling_scheme == 'new': self._frame_shape = (80, 104) obs_shape = (num_img_obs, self._frame_shape[1], self._frame_shape[0]) self._observation_space = IntBox(low=0, high=255, shape=obs_shape, dtype="uint8") self._max_frame = self.ale.getScreenGrayscale() self._raw_frame_1 = self._max_frame.copy() self._raw_frame_2 = self._max_frame.copy() self._obs = np.zeros(shape=obs_shape, dtype="uint8") # Settings self._has_fire = "FIRE" in self.get_action_meanings() self._has_up = "UP" in self.get_action_meanings() self._horizon = int(horizon) # Recording self.record_env = False # set in samping_process for environment 0 self._record_episode = False self._record_freq = record_freq self._video_dir = os.path.join(record_dir, 'videos') self._frames_dir = os.path.join(self._video_dir, 'frames') self._episode_number = 0 self.reset()
def __init__(self, alpha=0.6, beta=0.4, default_priority=1, unique=False, **kwargs): """Fix the SampleFromReplay length here, so priority tree can track where not to sample (else would have to temporarily subtract from tree every time sampling).""" super().__init__(**kwargs) save__init__args(locals()) assert self.batch_T is not None # Must assign. self.init_priority_tree()
def __init__(self, alpha=0.6, beta=0.4, default_priority=1, unique=False,
        **kwargs):
    super().__init__(**kwargs)
    save__init__args(locals())
    self.init_priority_tree()