def __init__(self, name, state_length, network_config, reinforce_config,
             feature_len, combine_decomposed_func, is_sigmoid=False,
             memory_restore=True):
    super(SADQ_GQF, self).__init__()
    self.name = name
    # self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.memory = ReplayBuffer_decom(self.reinforce_config.memory_size)
    self.learning = True
    self.explanation = False
    self.state_length = state_length
    self.feature_len = feature_len
    self.features = None
    # Global
    self.steps = 0
    self.reward_history = []
    self.episode_time_history = []
    self.best_reward_mean = -maxsize
    self.episode = 0
    self.reset()
    self.memory_restore = memory_restore
    reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
    if not self.network_config.restore_network:
        clear_summary_path(reinforce_summary_path)
    else:
        self.restore_state()
    self.summary = SummaryWriter(log_dir=reinforce_summary_path)
    self.eval_model = feature_q_model(name, state_length, self.feature_len,
                                      self.network_config.output_shape, network_config)
    self.target_model = feature_q_model(name, state_length, self.feature_len,
                                        self.network_config.output_shape, network_config)
    # self.target_model.eval_mode()
    self.beta_schedule = LinearSchedule(
        self.reinforce_config.beta_timesteps,
        initial_p=self.reinforce_config.beta_initial,
        final_p=self.reinforce_config.beta_final)
    self.epsilon_schedule = LinearSchedule(
        self.reinforce_config.epsilon_timesteps,
        initial_p=self.reinforce_config.starting_epsilon,
        final_p=self.reinforce_config.final_epsilon)

def __init__(self, name, state_length, network_config, reinforce_config,
             reward_num, combine_decomposed_func, memory_restore=True):
    super(SADQAdaptive, self).__init__()
    self.name = name
    # self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    if self.reinforce_config.use_prior_memory:
        self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
    else:
        self.memory = ReplayBuffer(self.reinforce_config.memory_size)
    self.learning = True
    self.state_length = state_length
    # Global
    self.steps = 0
    self.best_reward_mean = 0
    self.episode = 0
    self.combine_decomposed_reward = combine_decomposed_func
    self.reward_num = reward_num
    self.reset()
    self.memory_restore = memory_restore
    reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
    if not self.network_config.restore_network:
        clear_summary_path(reinforce_summary_path)
    else:
        self.restore_state()
    self.summary = SummaryWriter(log_dir=reinforce_summary_path)
    self.target_model = DQNModel(self.name + "_target", self.network_config, use_cuda)
    self.eval_model = DQNModel(self.name + "_eval", self.network_config, use_cuda)
    # self.target_model.eval_mode()
    self.beta_schedule = LinearSchedule(
        self.reinforce_config.beta_timesteps,
        initial_p=self.reinforce_config.beta_initial,
        final_p=self.reinforce_config.beta_final)
    self.epsilon_schedule = LinearSchedule(
        self.reinforce_config.epsilon_timesteps,
        initial_p=self.reinforce_config.starting_epsilon,
        final_p=self.reinforce_config.final_epsilon)

def initialize(self):
    # Create the replay buffer
    if self.prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(
            self.buffer_size, alpha=self.prioritized_replay_alpha)
        if self.prioritized_replay_beta_iters is None:
            self.prioritized_replay_beta_iters = self.max_timesteps
        self.beta_schedule = LinearSchedule(
            self.prioritized_replay_beta_iters,
            initial_p=self.prioritized_replay_beta0,
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size)
        self.beta_schedule = None
    # Create the schedule for exploration starting from 1.
    # self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * self.max_timesteps),
    #                                   initial_p=1.0,
    #                                   final_p=self.exploration_final_eps)
    self.exploration = ConstantSchedule(self.exploration_final_eps)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    self.update_target()
    return 'initialize() complete'

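# Nearly every snippet in this file anneals exploration or importance-sampling
# beta with LinearSchedule, or pins it with ConstantSchedule. A minimal sketch
# of the interpolation these schedules perform, assuming the
# baselines.common.schedules semantics (a reference, not this codebase's source):
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Linearly interpolate from initial_p to final_p, then hold final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)


class ConstantSchedule:
    def __init__(self, value):
        self._value = value

    def value(self, t):
        return self._value


# Usage: halfway through a 10000-step schedule, epsilon sits halfway
# between 1.0 and 0.02.
eps = LinearSchedule(10000, initial_p=1.0, final_p=0.02)
assert abs(eps.value(5000) - 0.51) < 1e-9
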
def __init__(self, mem_queue, max_timesteps=1000000, buffer_size=50000,
             batch_size=32, prioritized_replay=False,
             prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4,
             prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6):
    threading.Thread.__init__(self)
    self.mem_queue = mem_queue
    self.prioritized_replay = prioritized_replay
    self.batch_size = batch_size
    self.batch_idxes = None
    self.prioritized_replay_eps = prioritized_replay_eps
    # Create the replay buffer
    if prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(
            buffer_size, alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.beta_schedule = None

def __init__(self, is_chief, env, model, config, should_render=True):
    self.config = config
    self.is_chief = is_chief
    self.env = env
    self.should_render = should_render
    self.act, self.train, self.update_target, self.debug = multi_deepq.build_train(
        make_obs_ph=lambda name: U.Uint8Input(env.observation_space.shape, name=name),
        q_func=model,
        num_actions=env.action_space.n,
        gamma=config.gamma,
        optimizer=tf.train.AdamOptimizer(learning_rate=config.learning_rate),
        reuse=(not is_chief),
    )
    self.max_iteration_count = int(self.config.num_iterations)
    # Create the replay buffer
    self.replay_buffer = ReplayBuffer(config.replay_size)
    if self.config.exploration_schedule == "constant":
        self.exploration = ConstantSchedule(0.1)
    elif self.config.exploration_schedule == "linear":
        # Create the schedule for exploration starting from 1 (every action is random)
        # down to 0.02 (98% of actions are selected according to values predicted by the model).
        self.exploration = LinearSchedule(
            schedule_timesteps=self.config.num_iterations / 4,
            initial_p=1.0,
            final_p=0.02)
    elif self.config.exploration_schedule == "piecewise":
        approximate_num_iters = self.config.num_iterations
        self.exploration = PiecewiseSchedule([
            (0, 1.0),
            (approximate_num_iters / 50, 0.1),
            (approximate_num_iters / 5, 0.01)
        ], outside_value=0.01)
    else:
        raise ValueError("Bad exploration schedule")

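# The "piecewise" branch above interpolates linearly between a list of
# (timestep, value) endpoints and falls back to outside_value past the last
# one. A minimal self-contained sketch of that semantics (the endpoint numbers
# below assume num_iterations = 1e6 purely for illustration):
def piecewise_value(t, endpoints, outside_value):
    # Walk consecutive endpoint pairs and interpolate within the matching span.
    for (l_t, l_v), (r_t, r_v) in zip(endpoints[:-1], endpoints[1:]):
        if l_t <= t < r_t:
            alpha = float(t - l_t) / (r_t - l_t)
            return l_v + alpha * (r_v - l_v)
    return outside_value  # t is past the last endpoint


endpoints = [(0, 1.0), (20000, 0.1), (200000, 0.01)]
assert piecewise_value(10000, endpoints, 0.01) == 0.55   # halfway down the first leg
assert piecewise_value(500000, endpoints, 0.01) == 0.01  # clamped past the schedule
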
def __init__(self, model, opt, learning=True):
    super().__init__()
    self.memory = PrioritizedReplayBuffer(100000, 0.6)
    self.previous_state = None
    self.previous_action = None
    self.previous_legal_actions = None
    self.step = 0
    self.model_vae = model[0]
    self.model_dqn = model[1]
    self.model_dqn_target = model[2]
    self.opt_vae = opt[0]
    self.opt_dqn = opt[1]
    self.loss_vae = 0
    self.loss_dqn = 0
    self.batch_size = 32
    self.max_tile = 0
    self.totalCorrect = 0
    self.total = 0
    self.acc = 0
    self.beta = 0.7
    # self.test_q = 0
    self.epsilon_schedule = LinearSchedule(500000, initial_p=0.99, final_p=0.01)
    self.learning = learning

def __init__(self, name, choices, network_config, reinforce_config):
    super(PGAdaptive, self).__init__()
    self.name = name
    self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.update_frequency = reinforce_config.update_frequency
    self.replay_memory = Memory(self.reinforce_config.batch_size)
    self.steps = 0
    self.total_reward = 0
    self.previous_state = None
    self.previous_action = None
    self.clear_rewards()
    self.model = ActorModel(self.name + "_actor", self.network_config)
    self.summary = SummaryWriter(
        log_dir=self.reinforce_config.summaries_path + "/" + self.name)
    self.episode = 0
    self.epsilon_schedule = LinearSchedule(
        10 * 1000,
        initial_p=self.reinforce_config.starting_epsilon,
        final_p=0.1)

def __init__(self, env, network_policy,
             gamma=1.0,
             exploration_fraction=0.02, exploration_final_eps=0.01,
             steps_total=50000000,
             size_buffer=1000000, prioritized_replay=True,
             alpha_prioritized_replay=0.6, prioritized_replay_beta0=0.4,
             prioritized_replay_beta_iters=None, prioritized_replay_eps=1e-6,
             type_optimizer='Adam', lr=5e-4, eps=1.5e-4,
             time_learning_starts=20000, freq_targetnet_update=8000,
             freq_train=4, size_batch=32,
             callback=None, load_path=None,  # for debugging
             device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
             seed=42, **network_kwargs):
    super(DQN, self).__init__(env, gamma, seed)
    self.create_replay_buffer(prioritized_replay, prioritized_replay_eps,
                              size_buffer, alpha_prioritized_replay,
                              prioritized_replay_beta0,
                              prioritized_replay_beta_iters, steps_total)
    self.exploration = LinearSchedule(
        schedule_timesteps=int(exploration_fraction * steps_total),
        initial_p=1.0, final_p=exploration_final_eps)
    # An instance of DQN_NETWORK, which contains an instance of
    # FEATURE_EXTRACTOR and one additional head.
    self.network_policy = network_policy
    # Look the optimizer class up by name (e.g. 'Adam' -> optim.Adam).
    self.optimizer = getattr(optim, type_optimizer)(
        self.network_policy.parameters(), lr=lr, eps=eps)
    # Initialize the target network as a frozen copy of the policy network.
    self.network_target = copy.deepcopy(self.network_policy)
    for param in self.network_target.parameters():
        param.requires_grad = False
    self.network_target.eval()
    self.size_batch = size_batch
    self.time_learning_starts = time_learning_starts
    self.freq_train = freq_train
    self.freq_targetnet_update = freq_targetnet_update
    self.t, self.steps_total = 0, steps_total
    self.device = device
    self.step_last_print, self.time_last_print = 0, None

def __init__(self, config, env_creator):
    self.config = config
    self.local_timestep = 0
    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    if "cartpole" in self.config["env_config"]:
        self.env = env_creator(self.config["env_config"])
    else:
        self.env = wrap_deepmind(
            env_creator(self.config["env_config"]),
            clip_rewards=False,
            frame_stack=True,
            scale=True)
    self.obs = self.env.reset()
    self.sess = U.make_session()
    self.sess.__enter__()

    # Capture the shape outside the closure so that the env object is not
    # serialized by cloudpickle when serializing make_obs_ph.
    observation_space_shape = self.env.observation_space.shape

    def make_obs_ph(name):
        return BatchInput(observation_space_shape, name=name)

    if "cartpole" in self.config["env_config"]:
        q_func = models.mlp([64])
    else:
        q_func = models.cnn_to_mlp(
            convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
            hiddens=[256],
            dueling=True,
        )
    act, self.train, self.update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=self.env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=self.config["lr"]),
        gamma=self.config["gamma"],
        grad_norm_clipping=10,
        param_noise=False
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': self.env.action_space.n,
    }
    self.act = ActWrapper(act, act_params)
    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(self.config["exploration_fraction"] *
                               self.config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=self.config["exploration_final_eps"])
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    self.update_target()

def __init__(self, identifier, actions, observation_shape, num_steps, x=0.0, y=0.0):
    self.id = identifier
    self.actions = actions
    self.x = x
    self.y = y
    self.yellow_steps = 0
    self.postponed_action = None
    self.obs = None
    self.current_action = None
    self.weights = np.ones(32)
    self.td_errors = np.ones(32)
    self.pre_train = 2500
    self.prioritized = False
    self.prioritized_eps = 1e-4
    self.batch_size = 32
    self.buffer_size = 30000
    self.learning_freq = 500
    self.target_update = 5000
    # Create all the functions necessary to train the model
    self.act, self.train, self.update_target, self.debug = deepq.build_train(
        make_obs_ph=lambda name: TrafficTfInput(observation_shape, name=name),
        q_func=dueling_model,
        num_actions=len(actions),
        optimizer=tf.train.AdamOptimizer(learning_rate=1e-4, epsilon=1e-4),
        gamma=0.99,
        double_q=True,
        scope="deepq" + identifier
    )
    # Create the replay buffer
    if self.prioritized:
        self.replay_buffer = PrioritizedReplayBuffer(size=self.buffer_size, alpha=0.6)
        self.beta_schedule = LinearSchedule(num_steps // 4, initial_p=0.4, final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(self.buffer_size)
    # Create the schedule for exploration starting from 1 (every action is random)
    # down to 0.01 (99% of actions are selected according to values predicted by the model).
    self.exploration = LinearSchedule(schedule_timesteps=int(num_steps * 0.1),
                                      initial_p=1.0, final_p=0.01)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    self.update_target()

def __init__(self, name, choices, network_config, reinforce_config):
    super(DQNAdaptive, self).__init__()
    self.name = name
    self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.memory = PrioritizedReplayBuffer(self.reinforce_config.memory_size, 0.6)
    self.learning = True
    self.explanation = False
    # Global
    self.steps = 0
    self.reward_history = []
    self.episode_time_history = []
    self.best_reward_mean = -maxsize
    self.episode = 0
    self.reset()
    reinforce_summary_path = self.reinforce_config.summaries_path + "/" + self.name
    if not self.network_config.restore_network:
        clear_summary_path(reinforce_summary_path)
    else:
        self.restore_state()
    self.summary = SummaryWriter(log_dir=reinforce_summary_path)
    self.target_model = DQNModel(self.name + "_target", self.network_config, use_cuda)
    self.eval_model = DQNModel(self.name + "_eval", self.network_config, use_cuda)
    self.beta_schedule = LinearSchedule(
        self.reinforce_config.beta_timesteps,
        initial_p=self.reinforce_config.beta_initial,
        final_p=self.reinforce_config.beta_final)
    self.epsilon_schedule = LinearSchedule(
        self.reinforce_config.epsilon_timesteps,
        initial_p=self.reinforce_config.starting_epsilon,
        final_p=self.reinforce_config.final_epsilon)

def __init__(self, logdir, replay_alpha=0.6, replay_beta=0.4,
             t_beta_max=int(1e7), **kwargs):
    """Init."""
    super().__init__(logdir, **kwargs)
    self.buffer = PrioritizedReplayBuffer(self.buffer, alpha=replay_alpha)
    self.data_manager.buffer = self.buffer
    # Positional args follow the baselines-style (schedule_timesteps, final_p,
    # initial_p) order: beta anneals from replay_beta up to 1.0 over t_beta_max steps.
    self.beta_schedule = LinearSchedule(t_beta_max, 1.0, replay_beta)

def create_replay_buffer(self, prioritized_replay, prioritized_replay_eps,
                         size_buffer, alpha_prioritized_replay,
                         prioritized_replay_beta0,
                         prioritized_replay_beta_iters, steps_total):
    self.prioritized_replay = prioritized_replay
    self.prioritized_replay_eps = prioritized_replay_eps
    if prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(
            size_buffer, alpha=alpha_prioritized_replay)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = steps_total
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(size_buffer)
        self.beta_schedule = None

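# For reference, a minimal sketch of how a buffer built this way is consumed
# during training, assuming the baselines-style sample(batch_size, beta=...)
# return signature. `agent.train` is a hypothetical stand-in for whatever
# gradient step the surrounding agent exposes; the attribute names mirror the
# ones set in create_replay_buffer above.
import numpy as np


def train_step(agent, t, batch_size=32):
    if agent.prioritized_replay:
        # Prioritized sampling also returns importance weights and tree indices.
        (obses, actions, rewards, obses_next, dones,
         weights, batch_idxes) = agent.replay_buffer.sample(
            batch_size, beta=agent.beta_schedule.value(t))
    else:
        obses, actions, rewards, obses_next, dones = agent.replay_buffer.sample(batch_size)
        weights, batch_idxes = np.ones_like(rewards), None
    td_errors = agent.train(obses, actions, rewards, obses_next, dones, weights)
    if agent.prioritized_replay:
        # New priority = |TD error| + eps, so no transition's probability hits zero.
        agent.replay_buffer.update_priorities(
            batch_idxes, np.abs(td_errors) + agent.prioritized_replay_eps)
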
def _build_replay_buffer(self):
    # Create the replay buffer
    if self.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(
            self.memory_size, alpha=self.prioritized_replay_alpha)
        if self.prioritized_replay_beta_iters is None:
            self.prioritized_replay_beta_iters = self.prioritized_replay_iter
        self.beta_schedule = LinearSchedule(
            self.prioritized_replay_beta_iters,
            initial_p=self.prioritized_replay_beta0,
            final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(self.memory_size)
        self.beta_schedule = None
    return replay_buffer

def __init__(self, index, is_chief, env, model, queue, config, logger,
             episode_logger, should_render=False):
    self.config = config
    self.is_chief = is_chief
    self.env = env
    self.global_step = tf.train.get_global_step()
    self.should_render = should_render
    self.logger = logger
    self.episode_logger = episode_logger
    self.log_frequency = 10
    with tf.device('/cpu:0'):
        self.act, self.update_params, self.debug = qdqn.build_act(
            make_obs_ph=lambda name: U.Uint8Input(self.env.observation_space.shape, name=name),
            q_func=model,
            num_actions=self.env.action_space.n,
            scope="actor_{}".format(index),
            learner_scope="learner",
            reuse=False)
    with tf.device('/cpu:0'):
        obs_t_input = tf.placeholder(tf.uint8, self.env.observation_space.shape, name="obs_t")
        act_t_ph = tf.placeholder(tf.int32, self.env.action_space.shape, name="action")
        rew_t_ph = tf.placeholder(tf.float32, [], name="reward")
        obs_tp1_input = tf.placeholder(tf.uint8, self.env.observation_space.shape, name="obs_tp1")
        done_mask_ph = tf.placeholder(tf.float32, [], name="done")
        global_step_ph = tf.placeholder(tf.int32, [], name="sample_global_step")
        enqueue_op = queue.enqueue(
            [obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, global_step_ph])
        self.enqueue = U.function(
            [obs_t_input, act_t_ph, rew_t_ph, obs_tp1_input, done_mask_ph, global_step_ph],
            enqueue_op)
    self.max_iteration_count = self.config.num_iterations
    if self.config.exploration_schedule == "constant":
        self.exploration = ConstantSchedule(0.1)
    elif self.config.exploration_schedule == "linear":
        # Create the schedule for exploration starting from 1 (every action is random)
        # down to 0.02 (98% of actions are selected according to values predicted by the model).
        self.exploration = LinearSchedule(
            schedule_timesteps=self.config.num_iterations / 4,
            initial_p=1.0,
            final_p=0.02)
    elif self.config.exploration_schedule == "piecewise":
        approximate_num_iters = self.config.num_iterations
        self.exploration = PiecewiseSchedule([
            (0, 1.0),
            (approximate_num_iters / 50, 0.1),
            (approximate_num_iters / 5, 0.01)
        ], outside_value=0.01)
    else:
        raise ValueError("Bad exploration schedule")

def make_replay_buffer(self):
    if self.config["prioritized_replay"]:
        self.replay_buffer = PrioritizedReplayBuffer(
            self.config["buffer_size"],
            alpha=self.config["prioritized_replay_alpha"])
        if self.config["prioritized_replay_beta_iters"] is None:
            self.config["prioritized_replay_beta_iters"] = self.config["max_timesteps"]
        self.beta_schedule = LinearSchedule(
            self.config["prioritized_replay_beta_iters"],
            initial_p=self.config["prioritized_replay_beta0"],
            final_p=1.0)
    else:
        self.replay_buffer = ReplayBuffer(self.config["buffer_size"])
        self.beta_schedule = None

def __init__(self, size, alpha, epsilon, timesteps, initial_p, final_p):
    super(DoublePrioritizedReplayBuffer, self).__init__(size)
    assert alpha > 0
    self._alpha = alpha
    self._epsilon = epsilon
    self._beta_schedule = LinearSchedule(timesteps, initial_p=initial_p, final_p=final_p)
    # Segment trees require a power-of-two capacity.
    it_capacity = 1
    while it_capacity < size:
        it_capacity *= 2
    self._it_sum = SumSegmentTree(it_capacity)
    self._it_min = MinSegmentTree(it_capacity)
    self._max_priority = 1.0
    # Second set of trees for the "double" buffer.
    self._it_sum2 = SumSegmentTree(it_capacity)
    self._it_min2 = MinSegmentTree(it_capacity)
    self._max_priority2 = 1.0

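# The capacity rounding in the constructor above matters because a segment
# tree stores its nodes in a flat array indexed by repeated halving, which
# only works cleanly when the leaf count is a power of two. A quick
# standalone check of that rounding (just the same loop, extracted):
def round_up_pow2(size):
    # Smallest power of two >= size.
    cap = 1
    while cap < size:
        cap *= 2
    return cap


assert round_up_pow2(50000) == 65536
assert round_up_pow2(65536) == 65536  # exact powers of two are unchanged
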
def learn(env, args):
    ob = env.reset()
    ob_shape = ob.shape
    num_action = int(env.action_space.n)
    agent = TestAgent(ob_shape, num_action, args)
    replay_buffer = PrioritizedReplayBuffer(args.buffer_size,
                                            alpha=args.prioritized_replay_alpha)
    args.prioritized_replay_beta_iters = args.max_timesteps
    beta_schedule = LinearSchedule(args.prioritized_replay_beta_iters,
                                   initial_p=args.prioritized_replay_beta0,
                                   final_p=1.0)
    episode_rewards = [0.0]
    saved_mean_reward = None
    n_step_seq = []
    agent.sample_noise()
    agent.update_target()
    for t in range(args.max_timesteps):
        action = agent.act(ob)
        new_ob, rew, done, _ = env.step(action)
        replay_buffer.add(ob, action, rew, new_ob, float(done))
        ob = new_ob
        episode_rewards[-1] += rew
        if done:
            ob = env.reset()
            episode_rewards.append(0.0)
        if t > args.learning_starts and t % args.replay_period == 0:
            experience = replay_buffer.sample(args.batch_size,
                                              beta=beta_schedule.value(t))
            (obs, actions, rewards, obs_next, dones, weights, batch_idxes) = experience
            agent.sample_noise()
            kl_errors = agent.update(obs, actions, rewards, obs_next, dones, weights)
            replay_buffer.update_priorities(batch_idxes, np.abs(kl_errors) + 1e-6)
        if t > args.learning_starts and t % args.target_network_update_freq == 0:
            agent.update_target()
        mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
        num_episodes = len(episode_rewards)
        if done and args.print_freq is not None and len(episode_rewards) % args.print_freq == 0:
            print('steps {} episodes {} mean reward {}'.format(
                t, num_episodes, mean_100ep_reward))

def __init__(self, model, opt, learning=True):
    super().__init__()
    self.memory = ReplayBuffer(3000)
    self.previous_state = None
    self.previous_action = None
    self.previous_legal_actions = None
    self.step = 0
    self.model = model
    self.opt = opt
    self.loss = 0
    self.batch_size = 10
    self.test_q = 0
    self.max_tile = 0
    self.epsilon_schedule = LinearSchedule(1000000, initial_p=0.99, final_p=0.01)
    self.learning = learning

def __init__(self, statesize, actionsize, heros, update_target_period=100,
             scope="deepq", initial_p=1.0, final_p=0.02):
    self.act = None
    self.train = None
    self.update_target = None
    self.debug = None
    self.state_size = statesize
    # 50 = 8 move + 10 attack + 10 skill1 + 10 skill2 + 10 skill3 + recall + hold
    self.action_size = actionsize
    self.memory = PrioritizedReplayBuffer(500000, alpha=0.6)
    self.gamma = 0.9  # discount rate
    self.epsilon = 1.0  # exploration rate
    self.e_decay = .99
    self.e_min = 0.05
    self.learning_rate = 0.01
    self.heros = heros
    self.scope = scope
    self.model = self._build_model
    # TODO: heroes 1 and 2 have a basic-attack range of 2; revisit later.
    self.att_dist = 2
    self.act_times = 0
    self.train_times = 0
    self.update_target_period = update_target_period
    self.exploration = LinearSchedule(schedule_timesteps=3000,
                                      initial_p=initial_p, final_p=final_p)
    self.battle_rewards = []
    self.loss = []

def __init__(self, name, choices, reward_types, network_config, reinforce_config):
    super(HRAAdaptive, self).__init__()
    self.name = name
    self.choices = choices
    self.network_config = network_config
    self.reinforce_config = reinforce_config
    self.update_frequency = reinforce_config.update_frequency
    self.replay_memory = PrioritizedReplayBuffer(
        self.reinforce_config.memory_size, 0.6)
    self.learning = True
    self.explanation = False
    self.steps = 0
    self.previous_state = None
    self.previous_action = None
    self.reward_types = reward_types
    self.clear_rewards()
    self.total_reward = 0
    self.eval_model = HRAModel(self.name + "_eval", self.network_config)
    self.target_model = HRAModel(self.name + "_target", self.network_config)
    clear_summary_path(self.reinforce_config.summaries_path + "/" + self.name)
    self.summary = SummaryWriter(
        log_dir=self.reinforce_config.summaries_path + "/" + self.name)
    self.episode = 0
    self.beta_schedule = LinearSchedule(10 * 1000, initial_p=0.2, final_p=1.0)

def learn(env,
          network,
          seed=None,
          lr=5e-4,
          total_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          checkpoint_path=None,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          load_path=None,
          **network_kwargs):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    network: string or a function
        neural network to use as a q function approximator. If string, has to be one of
        the names of registered models in baselines.common.models (mlp, cnn, conv_only).
        If a function, should take an observation tensor and return a latent variable
        tensor, which will be mapped to the Q function heads (see build_q_func in
        baselines.deepq.models for details on that)
    seed: int or None
        prng seed. Runs with the same seed "should" give the same results.
        If None, no seeding is used.
    lr: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at
        the end of the training. If you do not wish to restore the best version at the
        end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value
        to 1.0. If None, equals total_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    param_noise: bool
        whether or not to use parameter space noise (https://arxiv.org/abs/1706.01905)
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.
    load_path: str
        path to load the model from (default: None)
    **network_kwargs
        additional keyword arguments to pass to the network builder

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = get_session() set_global_seeds(seed) q_func = build_q_func(network, **network_kwargs) # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space = env.observation_space def make_obs_ph(name): return ObservationInput(observation_space, name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = total_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * total_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: td = checkpoint_path or td model_file = os.path.join(td, "model") model_saved = False if tf.train.latest_checkpoint(td) is not None: load_variables(model_file) logger.log('Loaded model from {}'.format(model_file)) model_saved = True elif load_path is not None: load_variables(load_path) logger.log('Loaded model from {}'.format(load_path)) for t in range(total_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    save_variables(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_variables(model_file)

    return act, debug['q_func'], debug['obs']

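# The `weights` consumed by train() in the loops above are the prioritized
# buffer's importance-sampling correction. For reference, a minimal sketch of
# that computation under the standard PER formula
#   P(i) = p_i^alpha / sum_k p_k^alpha,   w_i = (N * P(i))^(-beta) / max_j w_j
# (a reference calculation, not these codebases' internals):
import numpy as np


def is_weights(priorities, alpha, beta):
    priorities = np.asarray(priorities, dtype=np.float64)
    probs = priorities ** alpha
    probs /= probs.sum()                     # sampling distribution P(i)
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()           # normalize so all weights <= 1


# Higher-priority transitions are sampled more often, so they get smaller
# weights to keep the gradient estimate unbiased.
print(is_weights([1.0, 2.0, 4.0], alpha=0.6, beta=0.4))
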
with U.make_session(8):
    # Create the environment
    env = gym.make("CartPole-v0")
    # Create all the functions necessary to train the model
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: U.BatchInput(env.observation_space.shape, name=name),
        q_func=model,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        param_noise=False
    )
    # Create the replay buffer
    replay_buffer = PrioritizedReplayBuffer(50000, alpha=0.6)
    # Create the schedule for exploration starting from 1 (every action is random)
    # down to 0.02 (98% of actions are selected according to values predicted by the model).
    exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()
    tvars = tf.trainable_variables()
    tvars_vals = U.get_session().run(tvars)
    for var, val in zip(tvars, tvars_vals):
        print(var.name, val)
    episode_rewards = [0.0]
    loss_array = []
    obs = env.reset()
    for t in itertools.count():

def learn(env,
          q_func,
          num_actions=3,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          param_noise=False,
          param_noise_threshold=0.05,
          callback=None,
          demo_replay=[]):
    """Train a deepq model.

    Parameters
    ----------
    env: pysc2.env.SC2Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at
        the end of the training. If you do not wish to restore the best version at the
        end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value
        to 1.0. If None, equals max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((64, 64), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=num_actions, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10 ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': num_actions, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() # Select all marines first player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE] screen = player_relative obs = common.init(env, obs) group_id = 0 reset = True with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. if param_noise_threshold >= 0.: update_param_noise_threshold = param_noise_threshold else: # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. 
                        1. - exploration.value(t) +
                        exploration.value(t) / float(num_actions))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            # Custom processing for DefeatZerglingsAndBanelings
            obs, screen, player = common.select_marine(env, obs)

            action = act(np.array(screen)[None], update_eps=update_eps, **kwargs)[0]
            reset = False
            rew = 0

            new_action = None
            obs, new_action = common.marine_action(env, obs, player, action)
            army_count = env._obs.observation.player_common.army_count

            try:
                if army_count > 0 and _ATTACK_SCREEN in obs[0].observation["available_actions"]:
                    obs = env.step(actions=new_action)
                else:
                    new_action = [sc2_actions.FunctionCall(_NO_OP, [])]
                    obs = env.step(actions=new_action)
            except Exception:
                pass  # Do nothing

            player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
            new_screen = player_relative

            rew += obs[0].reward
            done = obs[0].step_type == environment.StepType.LAST

            selected = obs[0].observation["screen"][_SELECTED]
            player_y, player_x = (selected == _PLAYER_FRIENDLY).nonzero()
            if len(player_y) > 0:
                player = [int(player_x.mean()), int(player_y.mean())]

            # Re-center the screen on the selected marine.
            if len(player) == 2:
                if player[0] > 32:
                    new_screen = common.shift(LEFT, player[0] - 32, new_screen)
                elif player[0] < 32:
                    new_screen = common.shift(RIGHT, 32 - player[0], new_screen)
                if player[1] > 32:
                    new_screen = common.shift(UP, player[1] - 32, new_screen)
                elif player[1] < 32:
                    new_screen = common.shift(DOWN, 32 - player[1], new_screen)

            # Store transition in the replay buffer.
            replay_buffer.add(screen, action, rew, new_screen, float(done))
            screen = new_screen

            episode_rewards[-1] += rew
            reward = episode_rewards[-1]

            if done:
                print("Episode Reward : %s" % episode_rewards[-1])
                obs = env.reset()
                player_relative = obs[0].observation["screen"][_PLAYER_RELATIVE]
                screen = player_relative
                group_list = common.init(env, obs)
                # Select all marines first
                # env.step(actions=[sc2_actions.FunctionCall(_SELECT_UNIT, [_SELECT_ALL])])
                episode_rewards.append(0.0)
                reset = True

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("reward", reward)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act)

def __init__(
        self,
        env,
        # observation_space,
        # action_space,
        network=None,
        scope='deepq',
        seed=None,
        lr=None,  # was 5e-4
        lr_mc=5e-4,
        total_episodes=None,
        total_timesteps=100000,
        buffer_size=50000,
        exploration_fraction=0.1,
        exploration_final_eps=None,  # was 0.02
        train_freq=1,
        train_log_freq=100,
        batch_size=32,
        print_freq=100,
        checkpoint_freq=10000,
        # checkpoint_path=None,
        learning_starts=1000,
        gamma=None,
        target_network_update_freq=500,
        prioritized_replay=False,
        prioritized_replay_alpha=0.6,
        prioritized_replay_beta0=0.4,
        prioritized_replay_beta_iters=None,
        prioritized_replay_eps=1e-6,
        save_path=None,
        load_path=None,
        save_reward_threshold=None,
        **network_kwargs):
    super().__init__(env, seed)
    if train_log_freq % train_freq != 0:
        raise ValueError('Train log frequency should be a multiple of train frequency')
    elif checkpoint_freq % train_log_freq != 0:
        raise ValueError('Checkpoint freq should be a multiple of train log frequency, '
                         'or model saving will not be logged properly')
    print('init dqnlearningagent')
    self.train_log_freq = train_log_freq
    self.scope = scope
    self.learning_starts = learning_starts
    self.save_reward_threshold = save_reward_threshold
    self.batch_size = batch_size
    self.train_freq = train_freq
    self.total_episodes = total_episodes
    self.total_timesteps = total_timesteps
    # TODO: scope not doing anything.
    if network is None and 'lunar' in env.unwrapped.spec.id.lower():
        if lr is None:
            lr = 1e-3
        if exploration_final_eps is None:
            exploration_final_eps = 0.02
        # exploration_fraction = 0.1
        # exploration_final_eps = 0.02
        target_network_update_freq = 1500
        # print_freq = 100
        # num_cpu = 5
        if gamma is None:
            gamma = 0.99
        network = 'mlp'
        network_kwargs = {
            'num_layers': 2,
            'num_hidden': 64,
        }
    self.target_network_update_freq = target_network_update_freq
    self.gamma = gamma
    get_session()
    # set_global_seeds(seed)
    # TODO: Check whether below is ok to substitute for set_global_seeds.
    try:
        import tensorflow as tf
        tf.set_random_seed(seed)
    except ImportError:
        pass

    self.q_func = build_q_func(network, **network_kwargs)

    # Capture the observation space outside the closure so that the env object
    # is not serialized by cloudpickle when serializing make_obs_ph.
    def make_obs_ph(name):
        return ObservationInput(env.observation_space, name=name)

    act, self.train, self.train_mc, self.update_target, debug = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=self.q_func,
        num_actions=env.action_space.n,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        optimizer_mc=tf.train.AdamOptimizer(learning_rate=lr_mc),
        gamma=gamma,
        grad_norm_clipping=10,
        param_noise=False,
        scope=scope,
        # reuse=reuse,
    )
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': self.q_func,
        'num_actions': env.action_space.n,
    }
    self._act = ActWrapper(act, act_params)

    self.print_freq = print_freq
    self.checkpoint_freq = checkpoint_freq

    # Create the replay buffer
    self.prioritized_replay = prioritized_replay
    self.prioritized_replay_eps = prioritized_replay_eps
    if self.prioritized_replay:
        self.replay_buffer = PrioritizedReplayBuffer(
            buffer_size,
            alpha=prioritized_replay_alpha,
        )
        if prioritized_replay_beta_iters is None:
            if total_episodes is not None:
                raise NotImplementedError(
                    'Need to check how to set exploration based on episodes')
            prioritized_replay_beta_iters = total_timesteps
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=prioritized_replay_beta0,
            final_p=1.0,
        )
    else:
        self.replay_buffer = ReplayBuffer(buffer_size)
        self.replay_buffer_mc = ReplayBuffer(buffer_size)
        self.beta_schedule = None
    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(
            exploration_fraction * total_timesteps
            if total_episodes is None else total_episodes),
        initial_p=1.0,
        final_p=exploration_final_eps,
    )
    # Initialize the parameters and copy them to the target network.
    U.initialize()
    self.update_target()

    self.episode_lengths = [0]
    self.episode_rewards = [0.0]
    self.discounted_episode_rewards = [0.0]
    self.start_values = [None]
    self.lunar_crashes = [0]
    self.lunar_goals = [0]
    self.saved_mean_reward = None

    self.td = None
    if save_path is None:
        self.td = tempfile.mkdtemp()
        outdir = self.td
        self.model_file = os.path.join(outdir, "model")
    else:
        outdir = os.path.dirname(save_path)
        os.makedirs(outdir, exist_ok=True)
        self.model_file = save_path
    print('DQN agent saving to:', self.model_file)
    self.model_saved = False

    if tf.train.latest_checkpoint(outdir) is not None:
        # TODO: Check scope addition
        load_variables(self.model_file, scope=self.scope)
        # load_variables(self.model_file)
        logger.log('Loaded model from {}'.format(self.model_file))
        self.model_saved = True
        raise Exception('Check that we want to load previous model')
    elif load_path is not None:
        # TODO: Check scope addition
        load_variables(load_path, scope=self.scope)
        # load_variables(load_path)
        logger.log('Loaded model from {}'.format(load_path))

    self.train_log_file = None
    if save_path and load_path is None:
        self.train_log_file = self.model_file + '.log.csv'
        with open(self.train_log_file, 'w') as f:
            cols = [
                'episode',
                't',
                'td_max',
                'td_mean',
                '100ep_r_mean',
                '100ep_r_mean_discounted',
                '100ep_v_mean',
                '100ep_n_crashes_mean',
                '100ep_n_goals_mean',
                'saved_model',
                'smoothing',
            ]
            f.write(','.join(cols) + '\n')

    self.training_episode = 0
    self.t = 0
    self.episode_t = 0
    """
    n = observation_space.n
    m = action_space.n
    self.Q = np.zeros((n, m))
    self._lr_schedule = lr_schedule
    self._eps_schedule = eps_schedule
    self._boltzmann_schedule = boltzmann_schedule
    """
    # Make placeholder for Q values
    self.q_values = debug['q_values']

def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.01,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=50,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None,
          num_optimisation_steps=40):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at
        the end of the training. If you do not wish to restore the best version at the
        end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value
        to 1.0. If None, equals max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = U.make_session(num_cpu=num_cpu) sess.__enter__() def make_obs_ph(name): return U.BatchInput((env.observation_space.shape[0] * 2, ), name=name) act, train, update_target, debug = deepq.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_max_rewards = [env.reward_max] episode_rewards = [0.0] saved_mean_reward_diff = None # difference in saved reward obs = env.reset(seed=np.random.randint(0, 1000)) with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") episode_buffer = [None] * env.n episode_timestep = 0 for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value action = act(np.concatenate([obs, env.goal])[None], update_eps=exploration.value(t))[0] new_obs, rew, done, _ = env.step(action) # Store transition in the replay buffer. episode_buffer[episode_timestep] = (obs, action, rew, new_obs, float(done)) episode_timestep += 1 replay_buffer.add(np.concatenate([obs, env.goal]), action, rew, np.concatenate([new_obs, env.goal]), float(done)) obs = new_obs episode_rewards[-1] += rew num_episodes = len(episode_rewards) #######end of episode if done: for episode in range(episode_timestep): obs1, action1, _, new_obs1, done1 = episode_buffer[episode] goal_prime = new_obs1 rew1 = env.calculate_reward(new_obs1, goal_prime) replay_buffer.add(np.concatenate([obs1, goal_prime]), action1, rew1, np.concatenate([new_obs1, goal_prime]), float(done1)) episode_timestep = 0 obs = env.reset(seed=np.random.randint(0, 1000)) episode_rewards.append(0.0) episode_max_rewards.append(env.reward_max) #############Training Q if t > learning_starts and num_episodes % train_freq == 0: for i in range(num_optimisation_steps): # Minimize the error in Bellman's equation on a batch sampled from replay buffer. if prioritized_replay: experience = replay_buffer.sample( batch_size, beta=beta_schedule.value(t)) (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience else: obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample( batch_size) weights, batch_idxes = np.ones_like(rewards), None td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights) if prioritized_replay: new_priorities = np.abs( td_errors) + prioritized_replay_eps replay_buffer.update_priorities( batch_idxes, new_priorities) #############Training Q target if t > learning_starts and num_episodes % target_network_update_freq == 0: # Update target network periodically. 
                update_target()

            mean_100ep_reward = np.mean(episode_rewards[-101:-1])
            mean_100ep_max_reward = np.mean(episode_max_rewards[-101:-1])
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("mean 100 episode max reward", mean_100ep_max_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and num_episodes % checkpoint_freq == 0):
                if (saved_mean_reward_diff is None or
                        mean_100ep_max_reward - mean_100ep_reward < saved_mean_reward_diff):
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward difference decrease: {} -> {}".format(
                            saved_mean_reward_diff, mean_100ep_max_reward - mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward_diff = mean_100ep_max_reward - mean_100ep_reward

        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward difference: {}".format(
                    saved_mean_reward_diff))
            U.load_state(model_file)

    return ActWrapper(act, act_params)

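# The end-of-episode loop in the learn() above replays each stored transition
# with the state actually reached as the goal, in the spirit of hindsight
# experience replay. A minimal standalone sketch of that relabeling, where
# calculate_reward is a hypothetical stand-in for the environment's
# goal-conditioned reward function:
import numpy as np


def relabel_episode(episode_buffer, calculate_reward):
    """Hindsight relabeling sketch: each transition is re-stored with its own
    achieved next state substituted as the goal, so it earns a meaningful reward
    even when the original goal was never reached."""
    relabeled = []
    for obs, action, _, new_obs, done in episode_buffer:
        goal_prime = new_obs                          # achieved outcome becomes the goal
        rew = calculate_reward(new_obs, goal_prime)   # reward under the new goal
        relabeled.append((np.concatenate([obs, goal_prime]), action, rew,
                          np.concatenate([new_obs, goal_prime]), done))
    return relabeled
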
def learn(env,
          q_func,
          beta1=0.9,
          beta2=0.999,
          epsilon=1e-8,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          exploration_schedule=None,
          start_lr=5e-4,
          end_lr=5e-4,
          start_step=0,
          end_step=1,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          param_noise=False,
          callback=None,
          model_directory=None,
          lamda=0.1):
    """Train a deepq model.

    Parameters
    ----------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    start_lr, end_lr: float
        learning rate for adam optimizer, interpolated from start_lr at start_step
        to end_lr at end_step
    beta1: float
        beta1 parameter for adam
    beta2: float
        beta2 parameter for adam
    epsilon: float
        epsilon parameter for adam
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    exploration_schedule: Schedule
        a schedule for exploration chance
    train_freq: int
        update the model every `train_freq` steps
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress; set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored at
        the end of the training. If you do not wish to restore the best version at the
        end of the training, set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps
    prioritized_replay: bool
        if True, a prioritized replay buffer will be used
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from its initial value
        to 1.0. If None, equals max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If the callback returns true, training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model sess = tf.Session() sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return ObservationInput(env.observation_space, name=name) global_step = tf.Variable(0, trainable=False) lr = interpolated_decay(start_lr, end_lr, global_step, start_step, end_step) act, train, update_target, debug = multiheaded_build_graph.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr, beta1=beta1, beta2=beta2, epsilon=epsilon), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, global_step=global_step, lamda=lamda, ) tf.summary.FileWriter(logger.get_dir(), graph_def=sess.graph_def) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. if exploration_schedule is None: exploration = LinearSchedule(schedule_timesteps=int( exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) else: exploration = exploration_schedule # Initialize the parameters and copy them to the target network. U.initialize() update_target() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True with tempfile.TemporaryDirectory() as td: model_saved = False if model_directory is None: model_directory = pathlib.Path(td) model_file = str(model_directory / "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value( t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs[ 'update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] if isinstance(env.action_space, gym.spaces.MultiBinary): env_action = np.zeros(env.action_space.n) env_action[action] = 1 else: env_action = action reset = False new_obs, rew, done, _ = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    act.save(str(model_directory / "act_model.pkl"))
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)
    return act
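
# A minimal usage sketch for the `learn` variant above. It assumes `gym` and
# the MLP model builder from baselines.deepq.models are available; the
# environment name and hyperparameter values are illustrative assumptions,
# not defaults taken from this codebase.
def _example_multiheaded_learn():
    import gym
    from baselines.deepq import models

    env = gym.make("CartPole-v0")
    act = learn(
        env,
        q_func=models.mlp([64]),      # (obs, num_actions, scope, reuse) -> Q tensor
        max_timesteps=100000,
        start_lr=5e-4,                # interpolated down to end_lr over [start_step, end_step]
        end_lr=1e-4,
        start_step=0,
        end_step=100000,
        lamda=0.1,
    )
    return act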
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.01,
          train_freq=1,
          batch_size=32,
          print_freq=100,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          test_agent=1e6,
          param_noise=False,
          double=True,
          lambda_double=False,
          lam=0.2,
          targets=1,
          piecewise_schedule=False,
          callback=None):
    """Train a deepq model.

    Parameters
    -------
    env: gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimize for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batch sampled from the replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None to disable printing
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    test_agent: int
        run a 50-episode near-greedy (eps = 0.001) evaluation of the current
        policy every `test_agent` steps
    param_noise: bool
        whether or not to use parameter space noise for exploration
    double: bool
        whether to use double Q-learning
    lambda_double: bool
        whether to use the lambda-weighted multi-target variant of the update
    lam: float
        decay coefficient for the per-target weights (`lam ** k` for the k-th target)
    targets: int
        number of target networks that are maintained and cycled round-robin
    piecewise_schedule: bool
        if True, use a piecewise exploration schedule instead of the linear one
    callback: (locals, globals) -> None
        function called at every step with the state of the algorithm.
        If callback returns true training stops.

    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
""" # Create all the functions necessary to train the model config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) sess.__enter__() # capture the shape outside the closure so that the env object is not serialized # by cloudpickle when serializing make_obs_ph observation_space_shape = env.observation_space.shape def make_obs_ph(name): return BatchInput(observation_space_shape, name=name) act, train, update_target, debug = deepq_base.build_train( make_obs_ph=make_obs_ph, q_func=q_func, num_actions=env.action_space.n, optimizer=tf.train.AdamOptimizer(learning_rate=lr), gamma=gamma, grad_norm_clipping=10, param_noise=param_noise, double_q=double, lambda_double=lambda_double, lam=lam, targets=targets, ) act_params = { 'make_obs_ph': make_obs_ph, 'q_func': q_func, 'num_actions': env.action_space.n, } act = ActWrapper(act, act_params) # Create the replay buffer if prioritized_replay: replay_buffer = PrioritizedReplayBuffer(buffer_size, alpha=prioritized_replay_alpha) if prioritized_replay_beta_iters is None: prioritized_replay_beta_iters = max_timesteps beta_schedule = LinearSchedule(prioritized_replay_beta_iters, initial_p=prioritized_replay_beta0, final_p=1.0) else: replay_buffer = ReplayBuffer(buffer_size) beta_schedule = None # Create the schedule for exploration starting from 1. if piecewise_schedule: exploration = PiecewiseSchedule(endpoints=[(0,1.0),(1e6,exploration_final_eps),(24e6,0.01)], outside_value=0.01) else: exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction * max_timesteps), initial_p=1.0, final_p=exploration_final_eps) # Initialize the parameters and copy them to the target network. U.initialize() targets_seq = np.array([i for i in range(targets)],dtype=np.int32) targets_lam = lam ** targets_seq for target in range(targets): update_target[target]() episode_rewards = [0.0] saved_mean_reward = None obs = env.reset() reset = True epinfobuf = deque(maxlen=100) test_flag = False with tempfile.TemporaryDirectory() as td: model_saved = False model_file = os.path.join(td, "model") for t in range(max_timesteps): if callback is not None: if callback(locals(), globals()): break # Take action and update exploration to the newest value kwargs = {} if not param_noise: update_eps = exploration.value(t) update_param_noise_threshold = 0. else: update_eps = 0. # Compute the threshold such that the KL divergence between perturbed and non-perturbed # policy is comparable to eps-greedy exploration with eps = exploration.value(t). # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017 # for detailed explanation. update_param_noise_threshold = -np.log(1. - exploration.value(t) + exploration.value(t) / float(env.action_space.n)) kwargs['reset'] = reset kwargs['update_param_noise_threshold'] = update_param_noise_threshold kwargs['update_param_noise_scale'] = True action = act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0] env_action = action reset = False new_obs, rew, done, info = env.step(env_action) # Store transition in the replay buffer. replay_buffer.add(obs, action, rew, new_obs, float(done)) obs = new_obs maybeepinfo = info.get('episode') if maybeepinfo: epinfobuf.extend([maybeepinfo]) episode_rewards[-1] += rew if done: obs = env.reset() episode_rewards.append(0.0) reset = True if t > learning_starts and t % train_freq == 0: # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 
                if prioritized_replay:
                    experience = replay_buffer.sample(batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights, targets_lam)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes, new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update the target networks periodically, cycling through them
                # round-robin and rotating the lambda weights to match.
                targets_seq = np.roll(targets_seq, 1)
                targets_lam = np.roll(targets_lam, -1)
                update_target[targets_seq[0]]()

            if t > learning_starts and t % test_agent == 0:
                test_flag = True

            if done and test_flag:
                # Evaluate the current policy with near-greedy action selection.
                nEpisodes = 50
                rewards = deque(maxlen=nEpisodes)
                for i in range(nEpisodes):
                    obs, done = env.reset(), False
                    reward = 0
                    maybeepinfo = None
                    while maybeepinfo is None:
                        obs, rew, done, info = env.step(act(obs[None], stochastic=True, update_eps=0.001)[0])
                        maybeepinfo = info.get('episode')
                        if maybeepinfo:
                            reward = maybeepinfo['r']
                    rewards.extend([reward])
                logger.record_tabular("test_reward_mean", np.mean(rewards))
                logger.record_tabular("steps", t)
                logger.dump_tabular()
                obs = env.reset()
                test_flag = False

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            # Compute the running mean unconditionally so the checkpoint branch
            # below never sees an unbound `mean_reward` (e.g. when print_freq is None).
            mean_reward = safemean([epinfo['r'] for epinfo in epinfobuf])
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(episode_rewards) % print_freq == 0:
                logger.record_tabular("episode_reward_mean", mean_reward)
                logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger.record_tabular("steps", t)
                logger.record_tabular("episodes", num_episodes)
                # logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                logger.dump_tabular()

            if (checkpoint_freq is not None and t > learning_starts and
                    num_episodes > 100 and t % checkpoint_freq == 0):
                if (saved_mean_reward is None or mean_reward > saved_mean_reward or
                        (mean_reward >= saved_mean_reward and mean_reward > 0)):
                    if print_freq is not None:
                        logger.log("Saving model due to mean reward increase: {} -> {}".format(
                            saved_mean_reward, mean_reward))
                    save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_reward
                    act.save()
        if model_saved:
            if print_freq is not None:
                logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            load_state(model_file)
    return act
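
# Illustrative invocation of the lambda/multi-target `learn` variant above.
# The environment, the bench.Monitor wrapper (needed so env.step reports
# info['episode'] for the evaluation loop), and all hyperparameter values are
# assumptions for this sketch, not tuned defaults.
def _example_lambda_double_learn():
    import gym
    from baselines import bench
    from baselines.deepq import models

    env = bench.Monitor(gym.make("CartPole-v0"), None)
    act = learn(
        env,
        q_func=models.mlp([64]),
        lr=5e-4,
        max_timesteps=100000,
        double=True,
        lambda_double=True,
        lam=0.2,          # k-th target weighted by lam ** k
        targets=4,        # four target networks cycled round-robin
        test_agent=50000, # 50-episode evaluation every 50k steps
    )
    return act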
# NOTE: as in baselines.her.ddpg, a @store_args decorator (from
# baselines.her.util) is assumed here; it binds every constructor argument to
# a same-named attribute (e.g. self.clip_return, self.network_class) before
# the body runs.
@store_args
def __init__(self, input_dims, buffer_size, hidden, layers, network_class,
             polyak, batch_size, Q_lr, pi_lr, norm_eps, norm_clip, max_u,
             action_l2, clip_obs, scope, T, rollout_batch_size, subtract_goals,
             relative_goals, clip_pos_returns, clip_return, sample_transitions,
             gamma, temperature, prioritization, env_name, alpha, beta0,
             beta_iters, eps, max_timesteps, rank_method, reuse=False, **kwargs):
    """Implementation of DDPG that is used in combination with Hindsight Experience Replay (HER).

    Args:
        input_dims (dict of ints): dimensions for the observation (o), the goal (g),
            and the actions (u)
        buffer_size (int): number of transitions that are stored in the replay buffer
        hidden (int): number of units in the hidden layers
        layers (int): number of hidden layers
        network_class (str): the network class that should be used
            (e.g. 'baselines.her.ActorCritic')
        polyak (float): coefficient for Polyak-averaging of the target network
        batch_size (int): batch size for training
        Q_lr (float): learning rate for the Q (critic) network
        pi_lr (float): learning rate for the pi (actor) network
        norm_eps (float): a small value used in the normalizer to avoid numerical
            instabilities
        norm_clip (float): normalized inputs are clipped to be in [-norm_clip, norm_clip]
        max_u (float): maximum action magnitude, i.e. actions are in [-max_u, max_u]
        action_l2 (float): coefficient for L2 penalty on the actions
        clip_obs (float): clip observations before normalization to be in
            [-clip_obs, clip_obs]
        scope (str): the scope used for the TensorFlow graph
        T (int): the time horizon for rollouts
        rollout_batch_size (int): number of parallel rollouts per DDPG agent
        subtract_goals (function): function that subtracts goals from each other
        relative_goals (boolean): whether or not relative goals should be fed into
            the network
        clip_pos_returns (boolean): whether or not positive returns should be clipped
        clip_return (float): clip returns to be in [-clip_return, clip_return]
        sample_transitions (function): function that samples from the replay buffer
        gamma (float): gamma used for Q learning updates
        temperature (float): temperature for entropy-based prioritization
        prioritization (str): prioritization scheme for the replay buffer
            ('entropy', 'tderror', or anything else for uniform replay)
        env_name (str): name of the environment, forwarded to the replay buffer
        alpha (float): alpha parameter for the TD-error prioritized replay buffer
        beta0 (float): initial beta for prioritized-replay importance sampling
        beta_iters (int): number of iterations over which beta is annealed to 1.0;
            if None, defaults to max_timesteps
        eps (float): small constant used with prioritized replay
        max_timesteps (int): total number of training timesteps
        rank_method (str): ranking method used for prioritization
        reuse (boolean): whether or not the networks should be reused
    """
    if self.clip_return is None:
        self.clip_return = np.inf

    self.create_actor_critic = import_function(self.network_class)

    input_shapes = dims_to_shapes(self.input_dims)
    self.dimo = self.input_dims['o']
    self.dimg = self.input_dims['g']
    self.dimu = self.input_dims['u']

    self.prioritization = prioritization
    self.env_name = env_name
    self.temperature = temperature
    self.rank_method = rank_method

    # Prepare staging area for feeding data to the model.
    stage_shapes = OrderedDict()
    for key in sorted(self.input_dims.keys()):
        if key.startswith('info_'):
            continue
        stage_shapes[key] = (None, *input_shapes[key])
    for key in ['o', 'g']:
        stage_shapes[key + '_2'] = stage_shapes[key]
    stage_shapes['r'] = (None,)
    stage_shapes['w'] = (None,)
    self.stage_shapes = stage_shapes

    # Create network.
    with tf.variable_scope(self.scope):
        self.staging_tf = StagingArea(
            dtypes=[tf.float32 for _ in self.stage_shapes.keys()],
            shapes=list(self.stage_shapes.values()))
        self.buffer_ph_tf = [
            tf.placeholder(tf.float32, shape=shape)
            for shape in self.stage_shapes.values()
        ]
        self.stage_op = self.staging_tf.put(self.buffer_ph_tf)

        self._create_network(reuse=reuse)

    # Configure the replay buffer.
    buffer_shapes = {
        key: (self.T if key != 'o' else self.T + 1, *input_shapes[key])
        for key in input_shapes
    }
    buffer_shapes['g'] = (buffer_shapes['g'][0], self.dimg)
    buffer_shapes['ag'] = (self.T + 1, self.dimg)
    buffer_size = (self.buffer_size // self.rollout_batch_size) * self.rollout_batch_size

    if self.prioritization == 'entropy':
        self.buffer = ReplayBufferEntropy(buffer_shapes, buffer_size, self.T,
                                          self.sample_transitions,
                                          self.prioritization, self.env_name)
    elif self.prioritization == 'tderror':
        self.buffer = PrioritizedReplayBuffer(buffer_shapes, buffer_size, self.T,
                                              self.sample_transitions, alpha, self.env_name)
        if beta_iters is None:
            beta_iters = max_timesteps
        self.beta_schedule = LinearSchedule(beta_iters, initial_p=beta0, final_p=1.0)
    else:
        self.buffer = ReplayBuffer(buffer_shapes, buffer_size, self.T, self.sample_transitions)
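
# The beta annealing above follows the same baselines-style LinearSchedule used
# by the deepq loops earlier in this file: beta rises linearly from beta0 to
# 1.0 over beta_iters steps and then stays clamped at 1.0. A self-contained
# illustration of that behavior:
def _example_beta_annealing():
    sched = LinearSchedule(1000, initial_p=0.4, final_p=1.0)
    assert abs(sched.value(0) - 0.4) < 1e-8      # starts at beta0
    assert abs(sched.value(500) - 0.7) < 1e-8    # halfway point
    assert abs(sched.value(2000) - 1.0) < 1e-8   # clamped past the horizon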
def _get_boltzmann_q(
        self,
        final_boltzmann_parameter,
        optimal_q=None,
        q_learning_episodes=10000,
        n_test_episodes=100,
        max_tries=10,
        verbose=False,
        logging=True,
):
    """Compute Q values for a Boltzmann policy.

    Args:
        final_boltzmann_parameter (float): final value of the exploration
            parameter annealed to during Q-learning.
        optimal_q (Optional[[[float]]]): Q values to use for computing
            Boltzmann Q values using policy evaluation. If None, learn Q
            values using Boltzmann exploration instead.
        q_learning_episodes (int): number of episodes to run tabular
            Q-learning for when `optimal_q` is None.
        n_test_episodes (int): number of greedy episodes used to evaluate
            the learned agent.
        verbose (bool): if True, print per-episode training diagnostics.

    Note: `max_tries` and `logging` are accepted but currently unused.
    """
    if optimal_q is not None:
        return self._get_boltzmann_q_policy_evaluation(
            boltzmann_parameter=final_boltzmann_parameter,
            optimal_q=optimal_q,
        )

    print('No optimal Q provided; re-solving with tabular Q-learning.')
    from .policies import TabularQLearningAgent
    agent = TabularQLearningAgent(
        action_space=self.env.action_space,
        observation_space=self.env.observation_space,
        eps_schedule=LinearSchedule(
            schedule_timesteps=int(0.9 * q_learning_episodes),
            initial_p=0,
            final_p=final_boltzmann_parameter,
        ),
        lr_schedule=LinearSchedule(
            schedule_timesteps=int(0.9 * q_learning_episodes),
            initial_p=1.0,
            final_p=0.02,
        ),
    )
    for ep in range(q_learning_episodes):
        obs = self.env.reset()
        done = False
        cum_reward = 0
        while not done:
            next_action = agent.act(obs, explore=True)
            obs1, reward, done, _ = self.env.step(next_action)
            cum_reward += reward
            agent.update(
                s=obs,
                a=next_action,
                s1=obs1,
                r=reward,
                done=done,
            )
            obs = obs1
        if verbose:
            print({
                'ep': ep,
                'lr': agent._lr,
                'eps': agent._eps,
                'q_norm': np.linalg.norm(agent.Q),
                'cum_reward': cum_reward,
            })

    # Test the learned agent with greedy action selection.
    cum_rewards = []
    for _ in range(n_test_episodes):
        obs = self.env.reset()
        done = False
        cum_reward = 0
        while not done:
            next_action = agent.act(obs, explore=False)
            obs1, reward, done, _ = self.env.step(next_action)
            cum_reward += reward
            obs = obs1
        cum_rewards.append(cum_reward)
    test_reward = np.mean(cum_rewards)
    print('Mean Boltzmann reward: {}'.format(test_reward))
    return copy(agent.Q)
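
# For reference, a Boltzmann (softmax) policy over the returned Q table picks
# actions with probability proportional to exp(Q[s, a] / temperature). A
# minimal, numerically stabilized sketch; the helper name is hypothetical and
# not part of this codebase.
def _boltzmann_action(q_row, temperature, rng=None):
    import numpy as np
    rng = rng or np.random
    logits = np.asarray(q_row, dtype=np.float64) / temperature
    logits -= logits.max()      # guard against overflow in exp
    probs = np.exp(logits)
    probs /= probs.sum()
    return rng.choice(len(probs), p=probs)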