Example No. 1
    def __init__(self, q_function, optimizer,
                 t_max, gamma, i_target, explorer, phi=lambda x: x,
                 average_q_decay=0.999, logger=getLogger(__name__),
                 batch_states=batch_states):

        self.shared_q_function = q_function
        self.target_q_function = copy.deepcopy(q_function)
        self.q_function = copy.deepcopy(self.shared_q_function)

        async_.assert_params_not_shared(
            self.shared_q_function, self.q_function)

        self.optimizer = optimizer

        self.t_max = t_max
        self.gamma = gamma
        self.explorer = explorer
        self.i_target = i_target
        self.phi = phi
        self.logger = logger
        self.average_q_decay = average_q_decay
        self.batch_states = batch_states

        self.t_global = mp.Value('l', 0)
        self.t = 0
        self.t_start = 0
        self.past_action_values = {}
        self.past_states = {}
        self.past_rewards = {}
        self.average_q = 0
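
The t_global attribute above is a process-shared step counter. A minimal sketch of that pattern, assuming only the standard library's multiprocessing module (the bump helper is illustrative and not part of the original snippet):

import multiprocessing as mp

t_global = mp.Value('l', 0)  # signed long shared across worker processes

def bump(counter, n):
    # get_lock() guards the read-modify-write so concurrent workers
    # cannot lose increments.
    with counter.get_lock():
        counter.value += n

bump(t_global, 5)
print(t_global.value)  # 5
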
Example No. 2
    def __init__(self,
                 model,
                 optimizer,
                 t_max,
                 gamma,
                 beta=1e-2,
                 process_idx=0,
                 phi=lambda x: x,
                 pi_loss_coef=1.0,
                 v_loss_coef=0.5,
                 keep_loss_scale_same=False,
                 normalize_grad_by_t_max=False,
                 use_average_reward=False,
                 average_reward_tau=1e-2,
                 act_deterministically=False,
                 average_entropy_decay=0.999,
                 average_value_decay=0.999,
                 batch_states=batch_states):

        assert isinstance(model, A3CModel)
        # Globally shared model
        self.shared_model = model

        # Thread specific model
        self.model = copy.deepcopy(self.shared_model)
        async_.assert_params_not_shared(self.shared_model, self.model)

        self.optimizer = optimizer

        self.t_max = t_max
        self.gamma = gamma
        self.beta = beta
        self.phi = phi
        self.pi_loss_coef = pi_loss_coef
        self.v_loss_coef = v_loss_coef
        self.keep_loss_scale_same = keep_loss_scale_same
        self.normalize_grad_by_t_max = normalize_grad_by_t_max
        self.use_average_reward = use_average_reward
        self.average_reward_tau = average_reward_tau
        self.act_deterministically = act_deterministically
        self.average_value_decay = average_value_decay
        self.average_entropy_decay = average_entropy_decay
        self.batch_states = batch_states

        self.t = 0
        self.t_start = 0
        self.past_action_log_prob = {}
        self.past_action_entropy = {}
        self.past_states = {}
        self.past_rewards = {}
        self.past_values = {}
        self.average_reward = 0
        # A3C won't use an explorer, but this attribute is referenced by run_dqn
        self.explorer = None

        # Stats
        self.average_value = 0
        self.average_entropy = 0
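
Both examples above use the same pattern: keep a globally shared model, give each worker a deep copy, and assert that no parameters are aliased between the two. A minimal sketch of that idea with a plain dict of NumPy arrays standing in for a Chainer model (the names here are illustrative only):

import copy

import numpy as np

shared_params = {'w': np.zeros((2, 2)), 'b': np.zeros(2)}

# Each worker deep-copies the shared parameters, so local updates never
# write into the shared arrays directly.
local_params = copy.deepcopy(shared_params)

# Rough analogue of async_.assert_params_not_shared: no array may be aliased.
for name, shared_array in shared_params.items():
    assert shared_array is not local_params[name]

local_params['w'] += 1.0
assert shared_params['w'][0, 0] == 0.0  # the shared copy is untouched
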
Example No. 3
    def __init__(self,
                 model,
                 optimizer,
                 replay_buffer=None,
                 t_max=None,
                 gamma=0.99,
                 tau=1e-2,
                 phi=lambda x: x,
                 pi_loss_coef=1.0,
                 v_loss_coef=0.5,
                 rollout_len=10,
                 batchsize=1,
                 disable_online_update=False,
                 n_times_replay=1,
                 replay_start_size=10**2,
                 normalize_loss_by_steps=True,
                 act_deterministically=False,
                 average_loss_decay=0.999,
                 average_entropy_decay=0.999,
                 average_value_decay=0.999,
                 explorer=None,
                 logger=None,
                 batch_states=batch_states,
                 backprop_future_values=True,
                 train_async=False):

        if train_async:
            # Globally shared model
            self.shared_model = model

            # Thread specific model
            self.model = copy.deepcopy(self.shared_model)
            async_.assert_params_not_shared(self.shared_model, self.model)
        else:
            self.model = model
        self.xp = self.model.xp

        self.optimizer = optimizer

        self.replay_buffer = replay_buffer
        self.t_max = t_max
        self.gamma = gamma
        self.tau = tau
        self.phi = phi
        self.pi_loss_coef = pi_loss_coef
        self.v_loss_coef = v_loss_coef
        self.rollout_len = rollout_len
        # Workaround for Chainer Issue #2807: batchsize should (look to) be
        # a scalar, so convert non-scalar values to xp.int32.
        if not self.xp.isscalar(batchsize):
            batchsize = self.xp.int32(batchsize)
        self.batchsize = batchsize
        self.normalize_loss_by_steps = normalize_loss_by_steps
        self.act_deterministically = act_deterministically
        self.disable_online_update = disable_online_update
        self.n_times_replay = n_times_replay
        self.replay_start_size = replay_start_size
        self.average_loss_decay = average_loss_decay
        self.average_value_decay = average_value_decay
        self.average_entropy_decay = average_entropy_decay
        self.logger = logger if logger else getLogger(__name__)
        self.batch_states = batch_states
        self.backprop_future_values = backprop_future_values
        self.train_async = train_async

        self.t = 0
        self.last_state = None
        self.last_action = None
        self.explorer = explorer
        self.online_batch_losses = []

        # Stats
        self.average_loss = 0
        self.average_value = 0
        self.average_entropy = 0

        self.init_history_data_for_online_update()
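
The batchsize check above works around Chainer Issue #2807 by forcing batchsize to a scalar type. A rough illustration of what xp.isscalar / xp.int32 do when xp is NumPy (the value 4 is made up for the example):

import numpy as np

batchsize = np.array(4)           # a 0-d array is not a scalar
assert not np.isscalar(batchsize)

batchsize = np.int32(batchsize)   # NumPy scalar types count as scalars
assert np.isscalar(batchsize)
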
Example No. 4
    def __init__(self,
                 model,
                 optimizer,
                 t_max,
                 gamma,
                 replay_buffer,
                 beta=1e-2,
                 phi=lambda x: x,
                 pi_loss_coef=1.0,
                 Q_loss_coef=0.5,
                 use_trust_region=True,
                 trust_region_alpha=0.99,
                 trust_region_delta=1,
                 truncation_threshold=10,
                 disable_online_update=False,
                 n_times_replay=8,
                 replay_start_size=10**4,
                 normalize_loss_by_steps=True,
                 act_deterministically=False,
                 use_Q_opc=False,
                 average_entropy_decay=0.999,
                 average_value_decay=0.999,
                 average_kl_decay=0.999,
                 logger=None):

        # Globally shared model
        self.shared_model = model

        # Globally shared average model used to compute trust regions
        self.shared_average_model = copy.deepcopy(self.shared_model)

        # Thread specific model
        self.model = copy.deepcopy(self.shared_model)
        async_.assert_params_not_shared(self.shared_model, self.model)

        self.optimizer = optimizer

        self.replay_buffer = replay_buffer
        self.t_max = t_max
        self.gamma = gamma
        self.beta = beta
        self.phi = phi
        self.pi_loss_coef = pi_loss_coef
        self.Q_loss_coef = Q_loss_coef
        self.normalize_loss_by_steps = normalize_loss_by_steps
        self.act_deterministically = act_deterministically
        self.use_trust_region = use_trust_region
        self.trust_region_alpha = trust_region_alpha
        self.truncation_threshold = truncation_threshold
        self.trust_region_delta = trust_region_delta
        self.disable_online_update = disable_online_update
        self.n_times_replay = n_times_replay
        self.use_Q_opc = use_Q_opc
        self.replay_start_size = replay_start_size
        self.average_value_decay = average_value_decay
        self.average_entropy_decay = average_entropy_decay
        self.average_kl_decay = average_kl_decay
        self.logger = logger if logger else getLogger(__name__)

        self.t = 0
        self.last_state = None
        self.last_action = None
        # ACER won't use an explorer, but this attribute is referenced by
        # run_dqn
        self.explorer = None

        # Stats
        self.average_value = 0
        self.average_entropy = 0
        self.average_kl = 0

        self.init_history_data_for_online_update()
Example No. 5
    def __init__(self,
                 generator,
                 discriminator,
                 gen_optimizer,
                 dis_optimizer,
                 dataset,
                 conditional,
                 reward_mode,
                 imsize,
                 max_episode_steps,
                 rollout_n,
                 gamma,
                 beta,
                 gp_lambda,
                 lambda_R,
                 staying_penalty,
                 empty_drawing_penalty,
                 n_save_final_obs_interval,
                 outdir,
                 act_deterministically=False,
                 average_entropy_decay=0.999,
                 average_value_decay=0.999,
                 process_idx=0,
                 pi_loss_coef=1.0,
                 v_loss_coef=1.0):

        # globally shared model
        self.shared_generator = generator
        self.shared_discriminator = discriminator

        # process specific model
        self.generator = copy.deepcopy(self.shared_generator)
        async_.assert_params_not_shared(self.shared_generator, self.generator)

        self.discriminator = copy.deepcopy(self.shared_discriminator)
        async_.assert_params_not_shared(self.shared_discriminator,
                                        self.discriminator)

        self.gen_optimizer = gen_optimizer
        self.dis_optimizer = dis_optimizer
        self.dataset = dataset
        self.conditional = conditional

        assert reward_mode in ('l2', 'dcgan', 'wgangp')
        self.reward_mode = reward_mode

        self.imsize = imsize
        self.max_episode_steps = max_episode_steps
        self.rollout_n = rollout_n
        self.gamma = gamma
        self.beta = beta
        self.gp_lambda = gp_lambda
        self.lambda_R = lambda_R
        self.staying_penalty = staying_penalty
        self.empty_drawing_penalty = empty_drawing_penalty
        self.n_save_final_obs_interval = n_save_final_obs_interval
        self.outdir = outdir
        self.act_deterministically = act_deterministically
        self.average_entropy_decay = average_entropy_decay
        self.average_value_decay = average_value_decay
        self.pi_loss_coef = pi_loss_coef
        self.v_loss_coef = v_loss_coef

        self.observation_saver = ObservationSaver(self.outdir, self.rollout_n,
                                                  self.imsize)

        # initialize stat
        self.stat_average_value = 0.0
        self.stat_average_entropy = 0.0
        self.update_n = 0  # number of updates

        self.__reset_flags()
        self.__reset_buffers()
        self.__reset_stats()