class Acer():
    def __init__(self, runner, model, buffer, log_interval, curiosity):
        self.runner = runner
        self.model = model
        self.curiosity = curiosity
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks, next_states, icm_actions = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        if self.curiosity and on_policy:
            # next_states and icm_actions only exist on-policy, straight from the runner
            icm_actions = icm_actions.reshape([runner.batch_ob_shape[0]])
            next_states = next_states.reshape(runner.batch_ob_shape)
            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                                model.initial_state, masks, steps,
                                                on_policy=on_policy,
                                                next_states=next_states,
                                                icm_actions=icm_actions)
        else:
            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                                model.initial_state, masks, steps,
                                                on_policy=on_policy,
                                                next_states=None, icm_actions=None)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
def learn(policy, env, seed, ob_space, ac_space, nsteps=5, total_timesteps=int(80e6),
          vf_coef=0.5, ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100,
          sil_update=4, sil_beta=0.0, save_dir=None):
    set_global_seeds(seed)
    nenvs = env.num_envs
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule,
                  sil_update=sil_update, sil_beta=sil_beta)
    runner = Runner(env, model, ob_space=ob_space, nsteps=nsteps, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run()
        episode_stats.feed(raw_rewards, masks)
        policy_loss, value_loss, policy_entropy, v_avg = model.train(
            obs, states, rewards, masks, actions, values)
        sil_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train()
        model.save(save_dir)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward", episode_stats.mean_reward())
            logger.record_tabular("best_episode_reward", float(model.sil.get_best_reward()))
            if sil_update > 0:
                logger.record_tabular("sil_num_episodes", float(model.sil.num_episodes()))
                logger.record_tabular("sil_valid_samples", float(sil_samples))
                logger.record_tabular("sil_steps", float(model.sil.num_steps()))
            logger.dump_tabular()
    env.close()
    return model
class Acer():
    def __init__(self, runner, model, buffer, log_interval):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
class Acer():
    def __init__(self, runner, model, buffer, log_interval, stats_interval):
        """
        :param Runner runner:
        :param Model model:
        :param Buffer buffer:
        :param int log_interval:
        :param int stats_interval:
        """
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        file_formatter = logging.Formatter('%(asctime)s %(message)s')
        stats_logger = logging.getLogger('stats_logger')
        stats_logger.setLevel(logging.INFO)
        # logger handlers
        stats_fh = logging.FileHandler(os.path.join(logger.get_dir(), 'results.log'))
        stats_fh.setFormatter(file_formatter)
        stats_logger.addHandler(stats_fh)
        self.stats_logger = stats_logger
        self.stats_interval = stats_interval

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("time", time.strftime('%m-%d %H:%M'))
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            logger.record_tabular("fph", '%.2fM' % ((steps / 1e6) / ((time.time() - self.tstart) / 3600)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

        if on_policy and (int(steps / runner.nbatch) % self.stats_interval == 0):
            if hasattr(self.runner.env, 'stats'):
                envs_stats = self.runner.env.stats()
                avg_stats = {}
                # keys_of_lists: for each list-valued key, the i-th entry counts how many
                # envs have an i-th element in the list for that key
                keys_of_lists = {}  # type: dict[str, list[int]]
                # init average stats
                for stats in envs_stats:
                    for key, val in stats.items():
                        if key not in avg_stats:
                            if isinstance(val, list):
                                avg_stats[key] = []
                                keys_of_lists[key] = []
                            else:
                                avg_stats[key] = 0
                # collect stats for each environment
                for stats in envs_stats:
                    for key, val in stats.items():
                        if isinstance(val, list):
                            avg_list = avg_stats[key]
                            counts = keys_of_lists[key]
                            len_diff = len(val) - len(counts)
                            if len_diff > 0:
                                counts.extend([0] * len_diff)
                                avg_list.extend([0] * len_diff)
                            for i, v in enumerate(val):
                                counts[i] += 1
                                avg_list[i] += v
                        else:
                            avg_stats[key] += val
                # average stats across envs
                for key, val in avg_stats.items():
                    if isinstance(val, list):
                        counts = keys_of_lists[key]
                        for i, v in enumerate(val):
                            val[i] = v / counts[i]
                    else:
                        avg_stats[key] = val / len(envs_stats)
                avg_stats['global_t'] = steps
                self.stats_logger.info(' '.join('%s=%s' % (key, val) for key, val in avg_stats.items()))
class Acer():
    def __init__(self, runner, model, buffer, log_interval, expert_buffer=None):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.expert_buffer = []

    def call(self, perform, save_networks, use_expert, expert, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        expert_buffer = self.expert_buffer
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            runner.myrun()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None and not perform:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        if not perform:
            # reshape stuff correctly
            obs = obs.reshape(runner.batch_ob_shape)
            actions = actions.reshape([runner.nbatch])
            rewards = rewards.reshape([runner.nbatch])
            mus = mus.reshape([runner.nbatch, runner.nact])
            dones = dones.reshape([runner.nbatch])
            masks = masks.reshape([runner.batch_ob_shape[0]])
            if not use_expert:
                names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                                    model.initial_state, masks, steps)
            else:
                expert_obs, expert_actions, expert_rewards, expert_mus, expert_dones, expert_masks = expert.get()
                expert_obs = expert_obs.reshape(runner.batch_ob_shape)
                expert_actions = expert_actions.reshape([runner.nbatch])
                expert_rewards = expert_rewards.reshape([runner.nbatch])
                expert_mus = expert_mus.reshape([runner.nbatch, runner.nact])
                expert_dones = expert_dones.reshape([runner.nbatch])
                expert_masks = expert_masks.reshape([runner.batch_ob_shape[0]])
                names_ops, values_ops = model.expert_train(
                    obs, actions, rewards, dones, mus, model.initial_state, masks, steps,
                    expert_obs, expert_actions, expert_rewards, expert_mus,
                    expert_dones, expert_masks)

            if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
                logger.record_tabular("total_timesteps", steps)
                logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
                # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
                # not just at the terminal state. Thus, this is mean until end of life, not end of
                # episode. For true episode rewards, see the monitor files in the log folder.
                logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
                logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
                for name, val in zip(names_ops, values_ops):
                    logger.record_tabular(name, float(val))
                logger.dump_tabular()
            # save every (log_interval * 10) training batches
            if save_networks and (int(steps / runner.nbatch) % (self.log_interval * 10) == 0):
                model.save(int(steps))
        else:  # perform: collect expert demonstrations instead of training
            expert_buffer.append([enc_obs, actions, rewards, mus, dones, masks])
            if len(expert_buffer) > 0 and len(expert_buffer) % 100 == 0:
                expert_dir = os.path.join('./expert') + '/'
                if not os.path.exists(expert_dir):
                    os.makedirs(expert_dir)
                with open(os.path.join(expert_dir, 'expert_test.pkl'), 'wb') as pwritefile:
                    pickle.dump(expert_buffer, pwritefile, -1)
                logger.info('Successfully Saved the Expert Data')
            obs = obs.reshape(runner.batch_ob_shape)
            actions = actions.reshape([runner.nbatch])
            rewards = rewards.reshape([runner.nbatch])
            mus = mus.reshape([runner.nbatch, runner.nact])
            dones = dones.reshape([runner.nbatch])
            masks = masks.reshape([runner.batch_ob_shape[0]])
            if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
                logger.record_tabular("total_timesteps", steps)
                logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
                # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
                # not just at the terminal state. Thus, this is mean until end of life, not end of
                # episode. For true episode rewards, see the monitor files in the log folder.
                logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
                logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
                logger.dump_tabular()
def learn(policy, env, seed, nsteps, nstack, total_timesteps, gamma, vf_coef, ent_coef,
          max_grad_norm, lr, lrschedule, rprop_epsilon=1e-5, rprop_alpha=0.99,
          log_interval=100):
    tf.reset_default_graph()
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    num_procs = len(env.remotes)  # HACK
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, nstack=nstack, num_procs=num_procs, ent_coef=ent_coef,
                  vf_coef=vf_coef, max_grad_norm=max_grad_norm, lr=lr,
                  rprop_alpha=rprop_alpha, rprop_epsilon=rprop_epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, nstack=nstack, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, raw_rewards, returns, masks, actions, values = runner.run()
        ravg_norm_obs, policy_loss, value_loss, policy_entropy = model.train(
            obs, states, returns, masks, actions, values)
        episode_stats.feed(raw_rewards, masks)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("avg_norm_obs", float(ravg_norm_obs))
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss * vf_coef))
            logger.record_tabular("entropy_loss", float(-policy_entropy * ent_coef))
            logger.record_tabular("total_loss",
                                  float(policy_loss - policy_entropy * ent_coef + value_loss * vf_coef))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("mean_episode_length", episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", episode_stats.mean_reward())
            logger.dump_tabular()
    env.close()
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5,
          ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100):
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()

    # hard-coded restart state; obsuse was produced earlier by runner.runset(0)
    # and saved with np.save('runner2obsaver.npy', obsuse)
    global ifnext, a, b, obsuse, w
    b = 5197
    ifnext = 0
    global ifnext1, a1, b1, obsuse1
    obsuse = np.load('runner2obsaver.npy')
    obsuse1 = np.load('runner2obsaver.npy')
    b1 = 5387
    k = 0
    w = 0
    mean_r = np.zeros(1100000)

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        episode_stats.feed(rewards, masks)
        policy_loss, value_loss, policy_entropy = model.train(
            obs, states, rewards, masks, actions, values)
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if (2 * update) % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward", episode_stats.mean_reward())
            logger.dump_tabular()
            k = k + 1
            mean_r[k] = episode_stats.mean_reward()
            np.save('mean_r.npy', mean_r)
            print(episode_stats.mean_reward())
    env.close()
    return model
class Acer():
    def __init__(self, runner, model, buffer, log_interval, curiosity, icm):
        self.runner = runner
        self.curiosity = curiosity
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.icm = icm

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            # on-policy: next_states holds the successor observations for the ICM targets
            enc_obs, enc_next_obs, obs, actions, rewards, mus, dones, masks, next_states, icm_actions = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, enc_next_obs, actions, rewards, mus, dones, masks, icm_actions)
        else:
            # get obs, next_obs, actions, rewards, mus, dones from buffer.
            obs, next_obs, actions, rewards, mus, dones, masks, icm_actions = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        if self.icm is not None:
            icm_actions = icm_actions.reshape([runner.batch_ob_shape[0]])
            if not on_policy:
                next_states = next_obs.reshape(runner.batch_ob_shape)
            else:
                next_states = next_states.reshape(runner.batch_ob_shape)
            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                                model.initial_state, masks, steps,
                                                next_states, icm_actions)
        else:
            names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                                model.initial_state, masks, steps,
                                                next_states=None, icm_actions=None)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
def learn(policy, env, seed, nsteps=5, total_timesteps=int(80e6), vf_coef=0.5,
          ent_coef=0.01, max_grad_norm=0.5, lr=7e-4, lrschedule='linear',
          epsilon=1e-5, alpha=0.99, gamma=0.99, log_interval=100,
          sil_update=4, sil_beta=0.0):
    set_global_seeds(seed)
    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    model = Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nenvs=nenvs,
                  nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                  max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon,
                  total_timesteps=total_timesteps, lrschedule=lrschedule,
                  sil_update=sil_update, sil_beta=sil_beta)
    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    episode_stats = EpisodeStats(nsteps, nenvs)
    nbatch = nenvs * nsteps
    tstart = time.time()

    # hard-coded restart state; obsuse was produced earlier by runner.runset(0)
    # and saved with np.save('runner2obsaver.npy', obsuse)
    global ifnext, a, b, obsuse, w
    b = 5197
    a = 4925
    ifnext = 0
    global ifnext1, a1, b1, obsuse1
    obsuse = np.load('runner2obsaver.npy')
    obsuse1 = np.load('runner2obsaver.npy')
    b1 = 5387
    print(ifnext, a, b)
    k = 0
    w = 0
    mean_r = np.zeros(1100000)
    best_r = np.zeros(1100000)

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, raw_rewards = runner.run()
        episode_stats.feed(raw_rewards, masks)
        policy_loss, value_loss, policy_entropy, v_avg = model.train(
            obs, states, rewards, masks, actions, values)
        sil_loss, sil_adv, sil_samples, sil_nlogp = model.sil_train()
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if (2 * update) % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("episode_reward", episode_stats.mean_reward())
            logger.record_tabular("best_episode_reward", float(model.sil.get_best_reward()))
            mean_r[k] = episode_stats.mean_reward()
            best_r[k] = float(model.sil.get_best_reward())
            print(episode_stats.mean_reward(), float(model.sil.get_best_reward()))
            k = k + 1
            np.save('mean_r.npy', mean_r)
            np.save('best_r.npy', best_r)
            if sil_update > 0:
                logger.record_tabular("sil_num_episodes", float(model.sil.num_episodes()))
                logger.record_tabular("sil_valid_samples", float(sil_samples))
                logger.record_tabular("sil_steps", float(model.sil.num_steps()))
            logger.dump_tabular()
            # if mean_r[k] > 0.8 and k > 4:  # reward is high, training has basically converged
            #     ifnext1, a1, b1, obsuse1 = runner.runset(best_r[k])
            #     print(best_r[k])
            #     np.save('runner2obsaver1.npy', obsuse1)
            #     print(ifnext1, a1, b1)
            #     w = w + 1
    env.close()
    return model
class Acer():
    def __init__(self, runner, model, buffer, log_interval, evaluate_env,
                 evaluate_interval, evaluate_n, logdir):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.evaluate_env = evaluate_env
        self.evaluate_interval = evaluate_interval
        self.evaluate_n = evaluate_n
        if logdir:
            self.summary_writer = tf.summary.FileWriter(logdir=logdir)
            self.logdir = logdir
            self.best_mean_reward = 0
            self.evaluation_f = open(logdir + '/evaluation_monitor.csv', "wt")
            self.evaluation_logger = csv.DictWriter(self.evaluation_f, fieldnames=('r', 'l'))
            self.evaluation_logger.writeheader()
        else:
            self.summary_writer = None

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.evaluate_interval == 0) and self.summary_writer:
            rewards_mean, length_mean = self.evaluate(self.evaluate_env, self.evaluate_n)
            stats = tf.Summary(value=[
                tf.Summary.Value(tag="reward_mean", simple_value=rewards_mean),
                tf.Summary.Value(tag="length_mean", simple_value=length_mean),
            ])
            self.summary_writer.add_summary(stats, steps)
            self.evaluation_logger.writerow({'r': rewards_mean, 'l': length_mean})
            self.evaluation_f.flush()
            if rewards_mean > self.best_mean_reward:
                self.best_mean_reward = rewards_mean
                self.model.save(self.logdir + '/' + str(steps // 1e4) + '_' + str(rewards_mean))

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

    def evaluate(self, env, n):
        reward_total = 0
        length_total = 0
        for i in range(n):
            reward_episode, length_episode = self.runner.evaluate(env)
            reward_total += reward_episode
            length_total += length_episode
        reward_mean = reward_total / n
        length_mean = length_total / n
        return reward_mean, length_mean
class Acer(object):
    def __init__(self, runner, model, buffer, log_interval):
        """
        Wrapper for the ACER model object

        :param runner: (AbstractEnvRunner) The runner to learn the policy of an environment for a model
        :param model: (Model) The model to learn
        :param buffer: (Buffer) The observation buffer
        :param log_interval: (int) The number of timesteps before logging.
        """
        super(Acer, self).__init__()
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.t_start = None
        self.episode_stats = EpisodeStats(runner.n_steps, runner.n_env)
        self.steps = None

    def call(self, on_policy):
        """
        Call a step with ACER

        :param on_policy: (bool) To step on policy and not on buffer
        """
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.n_batch])
        rewards = rewards.reshape([runner.n_batch])
        mus = mus.reshape([runner.n_batch, runner.n_act])
        dones = dones.reshape([runner.n_batch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.n_batch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.t_start)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of episode.
            # For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()
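# Minimal driver sketch (an illustration, not part of the original snippets): how
# Acer.call above is typically invoked, mirroring the learn() loop in the variant
# below -- one on-policy call per n_batch environment steps, then a Poisson-sampled
# number of off-policy replay calls once the buffer holds `replay_start` samples.
# `run_acer`, `replay_ratio` and `replay_start` are assumed names, not from the source.
import time

import numpy as np


def run_acer(acer, runner, buffer, total_timesteps, replay_ratio=4, replay_start=10000):
    acer.t_start = time.time()
    for acer.steps in range(0, total_timesteps, runner.n_batch):
        acer.call(on_policy=True)  # gather fresh experience and train on it
        if replay_ratio > 0 and buffer.has_atleast(replay_start):
            for _ in range(np.random.poisson(replay_ratio)):
                acer.call(on_policy=False)  # replay step: train from the buffer, no env interaction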
class Acer():
    def __init__(self, model, buffer, log_interval):
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        # FIXME: Remove dependency to runner (nsteps/nenv and batch shapes are still
        # read from a `runner` in the enclosing scope)
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None

    def call(self, mini_batch, on_policy):
        model, buffer, steps = self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = mini_batch
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        # FIXME: Remove dependency to runner
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

    def learn(self, mini_batch):
        # FIXME: total_timesteps, nbatch, replay_ratio, replay_start, save_interval,
        # buffer and model are still free variables from the enclosing scope.
        if self.tstart is None:
            self.tstart = time.time()
        # nbatch samples, 1 on_policy call and multiple off-policy calls
        for self.steps in range(0, total_timesteps, nbatch):
            self.call(mini_batch, on_policy=True)
            if replay_ratio > 0 and buffer.has_atleast(replay_start):
                n = np.random.poisson(replay_ratio)
                for _ in range(n):  # no simulation steps in this
                    self.call(mini_batch, on_policy=False)
            if save_interval and (self.steps % save_interval == 0 or self.steps == 1) and logger.get_dir():
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.8i' % self.steps)
                print('Saving to', savepath)
                model.save(savepath)
class Acer():
    def __init__(self, runner, model, buffer, log_interval, eval_env):
        self.runner = runner
        self.model = model
        self.buffer = buffer
        self.log_interval = log_interval
        self.tstart = None
        self.episode_stats = EpisodeStats(runner.nsteps, runner.nenv)
        self.steps = None
        self.eval_env = eval_env

    def call(self, on_policy):
        runner, model, buffer, steps = self.runner, self.model, self.buffer, self.steps
        if on_policy:
            enc_obs, obs, actions, rewards, mus, dones, masks = runner.run()
            self.episode_stats.feed(rewards, dones)
            if buffer is not None:
                buffer.put(enc_obs, actions, rewards, mus, dones, masks)
        else:
            # get obs, actions, rewards, mus, dones from buffer.
            obs, actions, rewards, mus, dones, masks = buffer.get()

        # reshape stuff correctly
        obs = obs.reshape(runner.batch_ob_shape)
        actions = actions.reshape([runner.nbatch])
        rewards = rewards.reshape([runner.nbatch])
        mus = mus.reshape([runner.nbatch, runner.nact])
        dones = dones.reshape([runner.nbatch])
        masks = masks.reshape([runner.batch_ob_shape[0]])

        names_ops, values_ops = model.train(obs, actions, rewards, dones, mus,
                                            model.initial_state, masks, steps)

        if on_policy and (int(steps / runner.nbatch) % self.log_interval == 0):
            # Evaluate for 10 episode ends before logging; skipped when no eval_env is given.
            epinfos = []
            if self.eval_env is not None:
                eval_qs = []
                eval_obs = self.eval_env.reset()
                epilen = 0
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(10000000):
                    eval_action, eval_q, _, _ = self.model.step(eval_obs)
                    # scale for execution in env (as far as DDPG is concerned,
                    # every action is in [-1, 1])
                    eval_obs, eval_r, eval_done, eval_info = self.eval_env.step(eval_action)
                    eval_episode_reward += eval_r
                    for info in eval_info:
                        maybeepinfo = info.get('episode')
                        if maybeepinfo:
                            epinfos.append(maybeepinfo)
                    eval_qs.append(eval_q)
                    for d in eval_done:
                        if d:
                            epilen += 1
                    if epilen >= 10:
                        break
                logger.record_tabular('eval_eplenmean',
                                      self.safemean([epinfo['l'] for epinfo in epinfos]))
                logger.record_tabular('eval_eprewmean',
                                      self.safemean([epinfo['r'] for epinfo in epinfos]))

            logger.record_tabular("total_timesteps", steps)
            logger.record_tabular("fps", int(steps / (time.time() - self.tstart)))
            # IMP: In EpisodicLife env, during training, we get done=True at each loss of life,
            # not just at the terminal state. Thus, this is mean until end of life, not end of
            # episode. For true episode rewards, see the monitor files in the log folder.
            logger.record_tabular("mean_episode_length", self.episode_stats.mean_length())
            logger.record_tabular("mean_episode_reward", self.episode_stats.mean_reward())
            for name, val in zip(names_ops, values_ops):
                logger.record_tabular(name, float(val))
            logger.dump_tabular()

    def safemean(self, xs):
        # avoids a RuntimeWarning from np.mean on an empty list
        return np.nan if len(xs) == 0 else np.mean(xs)