def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
    """Normalizing wrapper: tracks running stats for observations (if `ob`) and returns (if `ret`)."""
    VecEnvWrapper.__init__(self, venv)
    self.gamma = gamma
    self.epsilon = epsilon
    self.clipob = clipob
    self.cliprew = cliprew
    # One discounted-return accumulator per parallel environment.
    self.ret = np.zeros(self.num_envs)
    self.ob_rms = None
    if ob:
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
def __init__(self, venv, pretrained_reward_net_path, chain_path, embedding_dim, env_name):
    """Reward wrapper that scores frames with the mean MCMC posterior reward.

    Loads a pretrained EmbeddingNet, then replaces its final layer with a
    bias-free linear layer whose weights are the mean of an MCMC chain
    (read from `chain_path`, burn-in discarded and thinned).
    """
    VecEnvWrapper.__init__(self, venv)
    self.reward_net = EmbeddingNet(embedding_dim)
    # Load the pretrained feature weights.
    self.reward_net.load_state_dict(torch.load(pretrained_reward_net_path))
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # Load the MCMC chain (CSV rows with a trailing empty field), discard the
    # burn-in samples and thin the rest before averaging.
    burn = 5000
    skip = 20
    data = []
    with open(chain_path) as reader:  # FIX: file was previously never closed
        for line in reader:
            parsed = line.strip().split(',')
            data.append([float(s) for s in parsed[:-1]])
    data = np.array(data)
    # Average across the (thinned) chain; used as the last layer weights.
    mean_weight = np.mean(data[burn::skip, :], axis=0)

    # Last layer just outputs the scalar reward = w^T \phi(s); no bias.
    self.reward_net.fc2 = nn.Linear(embedding_dim, 1, bias=False)
    new_linear = torch.from_numpy(mean_weight)
    print("new linear", new_linear)
    print(new_linear.size())
    # FIX: collapsed the two redundant nested no_grad blocks into one.
    with torch.no_grad():
        # unsqueeze since nn.Linear wants a 2-d tensor for weights
        self.reward_net.fc2.weight.data = new_linear.unsqueeze(0).float().to(
            self.device)
    print("USING MEAN WEIGHTS FROM MCMC")
    self.reward_net.to(self.device)

    self.rew_rms = RunningMeanStd(shape=())
    self.epsilon = 1e-8
    self.cliprew = 10.
    self.env_name = env_name
def __init__(self, input_dim, hidden_dim, device):
    """GAIL discriminator: tanh MLP input -> hidden -> hidden -> scalar logit."""
    super(Discriminator, self).__init__()
    self.device = device
    layers = [
        nn.Linear(input_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, 1),
    ]
    self.trunk = nn.Sequential(*layers).to(device)
    self.trunk.train()
    self.optimizer = torch.optim.Adam(self.trunk.parameters())
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
def rms_from_csv(path):
    """Rebuild a RunningMeanStd from a CSV whose first row is the mean and second the variance."""
    with open(path, 'r') as file:
        rows = [[float(cell) for cell in row] for row in csv.reader(file)]
    mean = np.array(rows[0])
    var = np.array(rows[1])
    rms = RunningMeanStd(shape=mean.shape)
    rms.mean = mean
    rms.var = var
    return rms
def __init__(self, observation_space, action_space, device, args, log_only=False):
    """Adversarial imitation learner; `log_only` skips setup and only loads expert data."""
    super(AIL, self).__init__()
    if log_only:
        self.m_return_list = self.load_expert_data(args)
        return
    self.lr = args.il_lr  # larger learning rate for MLP
    self.action_dim = action_space.shape[0]
    self.hidden_dim = 100
    self.state_dim = observation_space.shape[0]
    self.create_networks()
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    self.gail_batch_size = args.gail_batch_size
    # Discriminator labels: +1 expert, -1 policy.
    self.label_expert = 1
    self.label_policy = -1
    self.reward_std = args.reward_std
    self.gp_lambda = args.gp_lambda
    self.m_return_list = self.make_dataset(args)
    # Default saturation is 1 for every loss except "unhinged",
    # which defaults to 0 (handled in its branch below).
    if args.ail_saturate is None and args.ail_loss_type != "unhinged":
        args.ail_saturate = 1
    loss_type = args.ail_loss_type
    if loss_type == "logistic":
        self.adversarial_loss = Logistic_Loss()
    elif loss_type == "unhinged":
        self.adversarial_loss = Unhinged_Loss()
        if args.ail_saturate is None:
            args.ail_saturate = 0
    elif loss_type == "sigmoid":
        self.adversarial_loss = Sigmoid_Loss()
    elif loss_type == "nlogistic":
        self.adversarial_loss = Normalized_Logistic_Loss()
    elif loss_type == "apl":
        self.adversarial_loss = APL_Loss()
    self.ail_saturate = args.ail_saturate
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        # NOTE(review): `ret` is accepted but ignored — ret_rms is forced to
        # None, so the reward-normalization branch in step_wait never runs.
        # Presumably deliberate; confirm before "fixing".
        self.ret_rms = None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)
        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        # (1 - news) zeroes the accumulated return on episode boundaries.
        self.ret = self.ret * self.gamma * (1 - news) + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if not self.ob_rms:
            return obs
        raw = copy.deepcopy(obs)
        self.ob_rms.update(obs)
        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                      -self.clipob, self.clipob)
        # Keep the trailing 6 observation entries un-normalized.
        for idx in range(len(raw)):
            obs[idx][-6:] = raw[idx][-6:]
        return obs

    def reset(self):
        """
        Reset all environments
        """
        return self._obfilt(self.venv.reset())
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        # Per-env discounted return accumulator driving reward normalization.
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        # BUG FIX: the accumulator was never reset on episode end, so returns
        # leaked across episodes and skewed the normalization statistics
        # (matches the upstream baselines implementation).
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        # Also clear the return accumulator so stale returns don't carry over.
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
    """Set up running normalizers for observations/returns plus clipping bounds."""
    VecEnvWrapper.__init__(self, venv)
    if ob:
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
    else:
        self.ob_rms = None
    if ret:
        self.ret_rms = RunningMeanStd(shape=())
    else:
        self.ret_rms = None
    self.clipob, self.cliprew = clipob, cliprew
    self.gamma, self.epsilon = gamma, epsilon
    # One return accumulator per parallel environment.
    self.ret = np.zeros(self.num_envs)
class MeanStdNormalizer(BaseNormalizer):
    """Normalizer that lazily builds its RunningMeanStd on first call."""

    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):
        BaseNormalizer.__init__(self, read_only)
        self.read_only = read_only
        self.clip = clip
        self.epsilon = epsilon
        self.rms = None  # created on first __call__, once the input shape is known

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            # Statistics are shared across the leading (batch) dimension.
            self.rms = RunningMeanStd(shape=(1, ) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        normalized = (x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon)
        return np.clip(normalized, -self.clip, self.clip)
def __init__(self, venv, model_dir, ctrl_coeff=0., alive_bonus=0.):
    """Ensemble reward wrapper with one return-normalizer per loaded model."""
    super().__init__(venv, model_dir, ctrl_coeff, alive_bonus)
    self.cliprew = 100.
    self.epsilon = 1e-8
    # One RunningMeanStd per ensemble member.
    self.rew_rms = [RunningMeanStd(shape=()) for _ in self.models]
def no_mpi_start_interaction(self, envs, dynamics, nlump=2):
    """Single-process variant of start_interaction: builds the Adam train op,
    restores or initializes the graph, and sets up the rollout collector."""
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)
    # Either restore from a checkpoint (resuming its step count) or init fresh.
    last_step = 0
    if self.load:
        last_step = self._load_graph()
    else:
        self._initialize_graph()
    # bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(envs)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = envs
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)
    # Resume the environment-step counter when restoring from a checkpoint.
    self.rollout.stats['tcount'] += last_step
    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    if self.normrew:
        # Normalize intrinsic rewards by a running std of discounted reward sums.
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()
    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    """Gaussian MLP actor-critic with TF-side observation and return filters.

    Builds a 3x128 tanh MLP over clipped, normalized observations; the policy
    head is a diagonal Gaussian with a state-independent log-std, the value
    head a scalar. `step`/`value` close over `sess` and the placeholders.
    """
    nbatch = nenv*nsteps
    # Observations are frame-stacked along the feature axis.
    ob_shape = (nbatch, ob_space.shape[0]*nstack)
    nact = ac_space.shape[0]
    X = tf.placeholder(tf.float32, ob_shape) #obs
    self.pdtype = pdtype = make_pdtype(ac_space)
    with tf.variable_scope("obfilter", reuse=reuse):
        self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
    with tf.variable_scope("retfilter", reuse=reuse):
        self.ret_rms = RunningMeanStd(shape=(1,))
    # Normalize and clip observations to [-5, 5] before the trunk.
    obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    #obz = X
    with tf.variable_scope("model", reuse=reuse):
        h1 = tf.nn.tanh(dense(obz, 128, "fc1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h2 = tf.nn.tanh(dense(h1, 128, "fc2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h3 = tf.nn.tanh(dense(h2, 128, "fc3", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        mean = dense(h3, nact, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
        # State-independent log-std, broadcast across the batch.
        logstd = tf.get_variable("logstd", [nact], tf.float32, tf.zeros_initializer())
        logstd = tf.expand_dims(logstd, 0)
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        vf = dense(h3, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)
        v0 = vf[:, 0]
    self.pd = pdtype.pdfromflat(pdparam)
    # Boolean switch between sampling and taking the distribution mode.
    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.initial_state = [] #not stateful

    def step(stoch, ob, *_args, **_kwargs):
        a, v = sess.run([a0, v0], {stochastic:stoch, X:ob})
        return a, v, [] #dummy state

    def value(ob, *_args, **_kwargs):
        return sess.run(v0, {X:ob})

    self.X = X
    self.vf = vf
    # Value prediction standardized by the return filter's statistics.
    self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
    self.step = step
    self.value = value
def start_interaction(self, env_fns, dynamics, nlump=2):
    """MPI-aware setup: build the (Mpi)Adam train op, initialize variables on
    rank 0 and broadcast them, then create the rollout collector.

    NOTE(review): `nenvs` is hard-coded to 256 and `nlump` to 1, ignoring the
    `nlump` argument and the length of `env_fns` — confirm this is intended.
    """
    print("start_interaction",env_fns)
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    # Use the MPI-synchronized optimizer only when actually running multi-worker.
    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)
    # Rank 0 initializes, then weights are broadcast to all workers.
    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))
    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs=256
    self.nlump = 1
    self.lump_stride = nenvs // self.nlump
    self.envs = [env_fns ]
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)
    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    if self.normrew:
        # Normalize intrinsic rewards by a running std of discounted reward sums.
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()
    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
class VecNormalize(VecEnvWrapper):
    """Normalizes the 'visual' entry of dict observations and the returns.

    Observation statistics are only updated while in training mode.
    """

    def __init__(self, venv, visual_obs=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.gamma = gamma
        self.epsilon = epsilon
        self.clipob = clipob
        self.cliprew = cliprew
        self.training = True
        self.ret = np.zeros(self.num_envs)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.spaces['visual'].shape) if visual_obs else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs['visual'] = self._obfilt(obs['visual'])
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        # Drop accumulated returns for environments that just ended.
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs, update=True):
        if not self.ob_rms:
            return obs
        if self.training and update:
            self.ob_rms.update(obs)
        scaled = (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(scaled, -self.clipob, self.clipob)

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        obs['visual'] = self._obfilt(obs['visual'])
        return obs

    def train(self):
        """Enable observation-statistics updates."""
        self.training = True

    def eval(self):
        """Freeze observation statistics."""
        self.training = False
def __init__(
    self, venv, ob=False, ret=False, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8
):
    """Normalization wrapper with both filters DISABLED by default.

    Akhil: add running mean and variance here so the correct mean and var can
    be inputted here when a model is loaded!
    """
    VecEnvWrapper.__init__(self, venv)
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.gamma = gamma
    self.epsilon = epsilon
    self.clipob = clipob
    self.cliprew = cliprew
    # Per-env discounted return accumulator.
    self.ret = np.zeros(self.num_envs)
class Normalize(gym.Wrapper):
    """
    A wrapper that normalizes the observations and returns from an environment.
    """

    def __init__(self, env, clip_ob=10, clip_rew=10, epsilon=1e-8, gamma=0.99):
        super().__init__(env)
        self.clip_ob = clip_ob
        self.clip_rew = clip_rew
        self.gamma = gamma
        self.epsilon = epsilon
        self._reset_rew()
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())

    def step(self, action):
        obs, rew, done, misc = self.env.step(action)
        # Track the discounted return and scale rewards by its running std.
        self.ret = self.ret * self.gamma + rew
        self.ret_rms.update(self.ret)
        rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
                      -self.clip_rew, self.clip_rew)
        if done:
            self._reset_rew()
        return self._ob_filter(obs), rew, done, misc

    def reset(self):
        self._reset_rew()
        return self._ob_filter(self.env.reset())

    def _ob_filter(self, obs):
        self.ob_rms.update(obs)
        centered = (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(centered, -self.clip_ob, self.clip_ob)

    def _reset_rew(self):
        # Single-element float32 accumulator for the discounted return.
        self.ret = np.zeros((1, ), dtype=np.float32)
def __init__(self, venv, nets, ctrl_coeff):
    """Reward wrapper scoring rewards with a single net (kept in a list)."""
    RewardWrapper.__init__(self, venv)
    self.venv = venv
    self.ctrl_coeff = ctrl_coeff
    self.nets = [nets]  # TODO change for one net
    self.epsilon = 1e-8
    self.cliprew = 10.
    # One return-normalizer per net.
    self.rew_rms = [RunningMeanStd(shape=()) for _ in self.nets]
def __init__(self, venv, reward_net_path, env_name):
    """Reward wrapper backed by a pretrained AtariNet reward model."""
    VecEnvWrapper.__init__(self, venv)
    self.env_name = env_name
    self.epsilon = 1e-8
    self.cliprew = 10.
    self.rew_rms = RunningMeanStd(shape=())
    net = AtariNet()
    net.load_state_dict(torch.load(reward_net_path))
    # Prefer GPU 0 when available.
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.reward_net = net.to(self.device)
def __init__(self, venv, reward_net_path, combo_param):
    """Reward wrapper mixing IRL and RL rewards via `combo_param`."""
    VecEnvWrapper.__init__(self, venv)
    net = AtariNet()
    net.load_state_dict(torch.load(reward_net_path))
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    net.to(self.device)
    self.reward_net = net
    # How much weight to give to IRL versus RL; combo_param in [0, 1],
    # 0 being pure RL and 1 being pure IRL.
    self.lamda = combo_param
    self.rew_rms = RunningMeanStd(shape=())
    self.epsilon = 1e-8
    self.cliprew = 10.
def __init__(self, state_dim, action_dim, user_dim, device, lr):
    """Conditional conv discriminator over state/action/user features."""
    super(Discriminator, self).__init__()
    self.device = device
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    # Embedding / pre-projection layers.
    self.label_embedding = nn.Embedding(10, 10)
    self.prefc1 = nn.Linear(user_dim, 25)
    self.linear = nn.Linear(state_dim * 6 + action_dim, 81)
    self.relu = nn.LeakyReLU(0.2, inplace=True)
    # Conv stack: 1 -> 2 -> 20 channels with pooling and batch norm.
    self.conv1 = nn.Conv2d(1, 2, 3)
    self.pool = nn.MaxPool2d(2, 1)
    self.conv2 = nn.Conv2d(2, 20, 3)
    self.conv2_bn = nn.BatchNorm2d(20)
    # Fully-connected head down to a single logit.
    self.fc1 = nn.Linear(180, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, 1)
    # Must come after all submodules are registered so it sees every parameter.
    self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
def __init__(self, env, model, nsteps, icm, gamma, curiosity):
    """ACER runner; optionally tracks curiosity-reward normalization state."""
    super().__init__(env=env, model=model, nsteps=nsteps, icm=icm)
    assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'
    assert isinstance(env, VecFrameStack)
    self.nact = env.action_space.n
    n_envs = self.nenv
    self.nbatch = n_envs * nsteps
    # One extra step per env for the bootstrap observation.
    self.batch_ob_shape = (n_envs * (nsteps + 1),) + env.observation_space.shape
    self.obs = env.reset()
    self.obs_dtype = env.observation_space.dtype
    self.ac_dtype = env.action_space.dtype
    self.nstack = self.env.nstack
    # Channels per frame, recovered from the stacked channel count.
    self.nc = self.batch_ob_shape[-1] // self.nstack
    self.curiosity = curiosity
    if self.curiosity:
        # Running normalization of the forward-filtered intrinsic reward.
        self.rff = RewardForwardFilter(gamma)
        self.rff_rms = RunningMeanStd()
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False):
    """Normalization wrapper; `use_tf` selects TF-variable-backed filters.

    The `useReset0` environment variable ("True"/"False", any casing)
    toggles the alternate reset behavior; unset or "None" means False.
    """
    VecEnvWrapper.__init__(self, venv)
    if use_tf:
        from baselines.common.running_mean_std import TfRunningMeanStd
        self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
        self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
    else:
        from baselines.common.running_mean_std import RunningMeanStd
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
    # FIX: read the env var once and parse it explicitly instead of calling
    # os.getenv() three times and eval()'ing untrusted environment input.
    raw = os.getenv("useReset0")
    if raw is None or raw == "None":
        self.useReset0 = False
    else:
        flag = raw.capitalize()
        if flag == "True":
            self.useReset0 = True
        elif flag == "False":
            self.useReset0 = False
        else:
            # The old eval() would have raised on arbitrary input too; make
            # the failure explicit and informative.
            raise ValueError("useReset0 must be 'True' or 'False', got %r" % raw)
    logger.log(" useReset0 is %s" % str(self.useReset0))
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False):
    """Normalization wrapper; `use_tf` selects the TF-variable-backed filters."""
    VecEnvWrapper.__init__(self, venv)
    obs_shape = self.observation_space.shape
    if use_tf:
        from baselines.common.running_mean_std import TfRunningMeanStd
        self.ob_rms = TfRunningMeanStd(shape=obs_shape, scope='ob_rms') if ob else None
        self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
    else:
        from baselines.common.running_mean_std import RunningMeanStd
        self.ob_rms = RunningMeanStd(shape=obs_shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.gamma = gamma
    self.epsilon = epsilon
    # Per-env discounted return accumulator.
    self.ret = np.zeros(self.num_envs)
def __init__(self, obs_shape, hidden_dim, num_actions, device, disc_lr,
             gail_reward_type=None, envs=None):
    """CNN discriminator for GAIL on image observations.

    The discrete action is broadcast as extra input channels via an embedding.
    """
    super(DiscriminatorCNN, self).__init__()
    self.device = device
    init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                           constant_(x, 0), nn.init.calculate_gain('relu'))
    self.num_actions = num_actions
    # BUG FIX: was .cuda(), which ignores `device` and crashes on CPU-only
    # hosts; move to the configured device like every other submodule.
    self.action_emb = nn.Embedding(num_actions, num_actions).to(device)
    num_inputs = obs_shape.shape[0] + num_actions
    # Nature-CNN style feature extractor.
    self.cnn = nn.Sequential(
        init_(nn.Conv2d(num_inputs, 32, 8, stride=4)), nn.ReLU(),
        init_(nn.Conv2d(32, 64, 4, stride=2)), nn.ReLU(),
        init_(nn.Conv2d(64, 32, 3, stride=1)), nn.ReLU(), Flatten(),
        init_(nn.Linear(32 * 7 * 7, hidden_dim)), nn.ReLU()).to(device)
    self.trunk = nn.Sequential(
        nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, 1)).to(device)
    self.cnn.train()
    self.trunk.train()
    self.optimizer = torch.optim.Adam(
        list(self.trunk.parameters()) + list(self.cnn.parameters()), lr=disc_lr)
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    self.reward_type = gail_reward_type
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.gamma = gamma
        self.epsilon = epsilon
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        # Zero accumulated returns for finished episodes.
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if not self.ob_rms:
            return obs
        self.ob_rms.update(obs)
        scaled = (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon)
        return np.clip(scaled, -self.clipob, self.clipob)

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        return self._obfilt(self.venv.reset())
def __init__(self, obs_norm):
    """Pendulum training harness; keeps reward/return running statistics."""
    self.env = gym.make('Pendulum-v0')
    # Separate filters for instantaneous rewards and discounted returns.
    self.rms = {
        'rewards': RunningMeanStd(epsilon=1e-9, shape=(1,)),
        'returns': RunningMeanStd(epsilon=1e-9, shape=(1,)),
    }
    self.ep_total_reward_list = []
    self.reward_list = []
    self.obs_norm = obs_norm
    self.build_network()
    # Session creation/initialization is delegated to baselines.common.tf_util.
    self.sess = get_session()
    initialize()
def __init__(
    self,
    venv: Env,
    ob: bool = True,
    ret: bool = True,
    clipob: float = 10.0,
    cliprew: float = 10.0,
    gamma: float = 0.99,
    epsilon: float = 1e-8,
    first_n: int = None,
) -> None:
    """
    Modified init function of VecNormalize. The only change here is in
    modifying the shape of self.ob_rms. The argument ``first_n`` controls
    how much of the observation we want to normalize: for an observation
    ``obs``, we normalize the vector ``obs[:first_n]``.
    """
    VecEnvWrapper.__init__(self, venv)
    # BUG FIX: `ob` is a bool, so the old test `ob is not None` was always
    # True and ob=False could never disable observation normalization.
    if ob:
        if first_n is None:
            self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        elif len(self.observation_space.shape) == 1:
            self.ob_rms = RunningMeanStd(shape=(first_n, ))
        else:
            # Partial normalization is only defined for flat observations.
            raise NotImplementedError
    else:
        self.ob_rms = None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
    self.first_n = first_n
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
    """Normalization wrapper that also supports Dict observation spaces."""
    VecEnvWrapper.__init__(self, venv)
    space = self.observation_space
    if isinstance(space, Dict):
        # One filter per sub-space key (each None when `ob` is disabled).
        self.ob_rms = {
            key: RunningMeanStd(shape=subspace.shape) if ob else None
            for key, subspace in space.spaces.items()
        }
    else:
        self.ob_rms = RunningMeanStd(shape=space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.gamma = gamma
    self.epsilon = epsilon
    self.ret = np.zeros(self.num_envs)
def __init__(self, input_dim, hidden_dim, device, reward_type, update_rms, cliprew_down=-10.0, cliprew_up=10.0):
    """MLP discriminator with configurable reward type and clipping range."""
    super(Discriminator, self).__init__()
    self.cliprew_down = cliprew_down
    self.cliprew_up = cliprew_up
    self.device = device
    self.reward_type = reward_type
    self.update_rms = update_rms
    # Tanh MLP trunk ending in an unsquashed scalar logit (no final Tanh).
    self.trunk = nn.Sequential(
        nn.Linear(input_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, 1),
    ).to(device)
    self.trunk.train()
    self.optimizer = torch.optim.Adam(self.trunk.parameters())
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
def __init__(self, venv, num_models, model_dir, include_action, num_layers, embedding_dims, ctrl_coeff=0., alive_bonus=0.):
    """Ensemble reward wrapper with per-model return normalizers."""
    super().__init__(venv, num_models, model_dir, include_action, num_layers,
                     embedding_dims, ctrl_coeff, alive_bonus)
    self.cliprew = 10.
    self.epsilon = 1e-8
    # One RunningMeanStd per ensemble member.
    self.rew_rms = [RunningMeanStd(shape=()) for _ in range(num_models)]
def test_runningmeanstd():
    """Incremental RunningMeanStd updates must match batch mean/var."""
    cases = [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]
    for chunks in cases:
        rms = RunningMeanStd(epsilon=0.0, shape=chunks[0].shape[1:])
        for chunk in chunks:
            rms.update(chunk)
        full = np.concatenate(chunks, axis=0)
        expected = [full.mean(axis=0), full.var(axis=0)]
        assert np.allclose(expected, [rms.mean, rms.var])
def test_runningmeanstd():
    """Test RunningMeanStd object"""
    shapes = [(3,), (3, 2)]
    for base in shapes:
        batches = [np.random.randn(n, *base[1:]) for n in (3, 4, 5)]
        rms = RunningMeanStd(epsilon=0.0, shape=base[1:])
        for batch in batches:
            rms.update(batch)
        combined = np.concatenate(batches, axis=0)
        assert np.allclose([combined.mean(axis=0), combined.var(axis=0)],
                           [rms.mean, rms.var])