def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8, use_tf=False):
    VecEnvWrapper.__init__(self, venv)
    if use_tf:
        from baselines.common.running_mean_std import TfRunningMeanStd
        self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                       scope='ob_rms') if ob else None
        self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
    else:
        from baselines.common.running_mean_std import RunningMeanStd
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
    # Optional flag read from the environment; defaults to False when the
    # variable is unset or is the literal string "None". Parsed directly
    # rather than with eval() on the capitalized value.
    env_flag = os.getenv("useReset0")
    if env_flag is None or env_flag == "None":
        self.useReset0 = False
    else:
        self.useReset0 = env_flag.strip().lower() == "true"
    logger.log(" useReset0 is %s" % str(self.useReset0))
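# All of the wrappers in this file depend on baselines' RunningMeanStd. For
# reference, a minimal NumPy sketch of such a filter using the
# parallel-variance update (an illustrative approximation of the library
# class, not the exact baselines implementation):
import numpy as np

class RunningMeanStdSketch:
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon  # small prior count avoids division by zero

    def update(self, x):
        batch_mean, batch_var, batch_count = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        # Combine the two second moments (Chan et al. parallel variance).
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count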
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8):
    VecEnvWrapper.__init__(self, venv)
    # Multi-agent case: the observation space is either directly indexable
    # or wraps the per-agent spaces in a .spaces attribute (gym.spaces.Tuple).
    try:
        self.num_agents = num_agents = len(self.observation_space)
        self.ob_rms = [
            RunningMeanStd(shape=self.observation_space[k].shape)
            for k in range(num_agents)
        ] if ob else None
    except TypeError:
        self.num_agents = num_agents = len(self.observation_space.spaces)
        self.ob_rms = [
            RunningMeanStd(shape=self.observation_space.spaces[k].shape)
            for k in range(num_agents)
        ] if ob else None
    # A single return filter is shared across agents; a per-agent
    # alternative would be:
    # [RunningMeanStd(shape=()) for k in range(num_agents)] if ret else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    # self.ret = [np.zeros(self.num_envs) for _ in range(num_agents)]
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8, use_tf=False):
    VecEnvWrapper.__init__(self, venv)
    if use_tf:
        from baselines.common.running_mean_std import TfRunningMeanStd
        self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                       scope='ob_rms') if ob else None
        self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
    else:
        from baselines.common.running_mean_std import RunningMeanStd
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
def __init__(self, venv, ob=True, ret=True, train=True, noclip=False,
             has_timestep=False, ignore_mask=None, freeze_mask=None,
             time_scale=1e-3, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
    VecEnvWrapper.__init__(self, venv)
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.train = train
    self.gamma = gamma
    self.epsilon = epsilon
    self.noclip = noclip
    self.ignore_mask = ignore_mask
    self.freeze_mask = freeze_mask
    self.has_timestep = has_timestep
    self.time_scale = time_scale
def __init__(self, venv, norm_obs=True, norm_reward=True, clip_obs=10.,
             clip_reward=10., gamma=0.99, epsilon=1e-8):
    """
    A rolling-average, normalizing, vectorized wrapper for an environment base class.

    :param venv: ([Gym Environment]) the list of environments to vectorize and normalize
    :param norm_obs: (bool) normalize observations
    :param norm_reward: (bool) normalize rewards with discounting (r = sum(r_old) * gamma + r_new)
    :param clip_obs: (float) clipping value for normalized observations
    :param clip_reward: (float) clipping value for normalized rewards
    :param gamma: (float) discount factor
    :param epsilon: (float) epsilon value to avoid arithmetic issues
    """
    VecEnvWrapper.__init__(self, venv)
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if norm_obs else None
    self.ret_rms = RunningMeanStd(shape=()) if norm_reward else None
    self.clip_obs = clip_obs
    self.clip_reward = clip_reward
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
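# The constructors above only set up state; the normalization itself happens
# on the step path. A hedged sketch of what such a step_wait typically looks
# like, using the clipob/cliprew naming of the baselines-style variants above
# (individual forks differ in details such as helper-method names):
def step_wait(self):
    obs, rews, news, infos = self.venv.step_wait()
    # Track a per-env discounted return and scale rewards by its running std.
    self.ret = self.ret * self.gamma + rews
    if self.ret_rms:
        self.ret_rms.update(self.ret)
        rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                       -self.cliprew, self.cliprew)
    # Whiten and clip observations with the running obs statistics.
    if self.ob_rms:
        self.ob_rms.update(obs)
        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                      -self.clipob, self.clipob)
    self.ret[news] = 0.  # reset the discounted return where an episode ended
    return obs, rews, news, infos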
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    # Build the optimizer and the training op.
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.placeholder_lr,
                                   comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.placeholder_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)

    # Rank 0 initializes the variables, then broadcasts them to all workers.
    if MPI.COMM_WORLD.Get_rank() == 0:
        tf.get_default_session().run(
            tf.variables_initializer(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        tf.get_default_session(),
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space])
        for l in range(self.nlump)
    ]
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space,
                           nenvs=nenvs, nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.policy,
                           int_rew_coeff=self.int_coeff,
                           ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder,
                           dynamics=dynamics)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()
        if self.dynamics.dropout:
            self.rff2 = RewardForwardFilter(self.gamma)
            self.rff_rms2 = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
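# RewardForwardFilter, paired with rff_rms above, maintains a per-env
# discounted sum of (intrinsic) rewards whose running std is then used to
# scale them. A minimal sketch matching the common definition from the
# large-scale-curiosity / RND codebases (hedged: this fork may differ):
class RewardForwardFilter(object):
    def __init__(self, gamma):
        self.rewems = None  # running discounted reward sum, one entry per env
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

# Typical use: rffs = rff.update(rews); rff_rms.update(rffs.ravel());
# normalized_rews = rews / np.sqrt(rff_rms.var)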
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8):
    MTVecEnvWrapper.__init__(self, venv)
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
def __init__(self, env, clip_ob=10, clip_rew=10, epsilon=1e-8, gamma=0.99):
    super().__init__(env)
    self.clip_ob = clip_ob
    self.clip_rew = clip_rew
    self._reset_rew()
    self.gamma = gamma
    self.epsilon = epsilon
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
    self.ret_rms = RunningMeanStd(shape=())
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8, reward_scale=1., update=True):
    VecEnvWrapper.__init__(self, venv)
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
    # Attribute names to persist when saving/restoring the wrapper state.
    self.variables_name_save = ['clipob', 'cliprew', 'ret', 'gamma', 'epsilon']
    self.reward_scale = reward_scale
    self.update = update
def __init__(self, venv, ob=True, ret=True, clipob=5., cliprew=5.,
             ext_gamma=0.999, int_gamma=0.999, epsilon=1e-8):
    super(VecNormalize, self).__init__(venv)
    self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    # Separate return filters for extrinsic and intrinsic rewards.
    self.ext_ret_rms = RunningMeanStd(shape=()) if ret else None
    self.int_ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipobs = clipob
    self.cliprew = cliprew
    self.ext_ret = np.zeros(self.num_envs)
    self.ext_gamma = ext_gamma
    self.int_ret = np.zeros(self.num_envs)
    self.int_gamma = int_gamma
    self.epsilon = epsilon
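# With separate extrinsic/intrinsic filters as above, the reward path keeps
# two discounted returns. A hedged sketch of the corresponding update (the
# method name _normalize_rewards is hypothetical; the RND-style fork this
# comes from may structure it differently):
def _normalize_rewards(self, ext_rews, int_rews, news):
    self.ext_ret = self.ext_ret * self.ext_gamma + ext_rews
    self.int_ret = self.int_ret * self.int_gamma + int_rews
    self.ext_ret_rms.update(self.ext_ret)
    self.int_ret_rms.update(self.int_ret)
    ext_rews = np.clip(ext_rews / np.sqrt(self.ext_ret_rms.var + self.epsilon),
                       -self.cliprew, self.cliprew)
    int_rews = np.clip(int_rews / np.sqrt(self.int_ret_rms.var + self.epsilon),
                       -self.cliprew, self.cliprew)
    # In RND the intrinsic reward is treated as non-episodic, so whether the
    # intrinsic return resets at episode boundaries is a design choice.
    self.ext_ret[news] = 0.
    return ext_rews, int_rews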
def __init__(self, *, env, model, nsteps, gamma, lam):
    super().__init__(env=env, model=model, nsteps=nsteps)
    # Lambda used in GAE (Generalized Advantage Estimation)
    self.lam = lam
    # Discount rate
    self.gamma = gamma
    self.clipob = 10.
    self.cliprew = 10.
    self.epsilon = 1e-8
    self.ret = 0
    self.ob_rms = RunningMeanStd(shape=self.env.observation_space.shape)
    self.ret_rms = RunningMeanStd(shape=())
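# gamma and lam above parameterize Generalized Advantage Estimation. For
# reference, a minimal NumPy sketch of the GAE backward recursion
# (illustrative; the runner's actual batch layout may differ):
import numpy as np

def compute_gae(rewards, values, dones, last_value, gamma, lam):
    # rewards, values, dones: arrays of shape (nsteps, nenvs)
    nsteps = rewards.shape[0]
    advs = np.zeros_like(rewards)
    lastgaelam = 0.0
    for t in reversed(range(nsteps)):
        nextvalue = last_value if t == nsteps - 1 else values[t + 1]
        nonterminal = 1.0 - dones[t]
        # TD residual, then the exponentially weighted recursion.
        delta = rewards[t] + gamma * nextvalue * nonterminal - values[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        advs[t] = lastgaelam
    return advs, advs + values  # advantages and returns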
def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8):
    VecEnv.__init__(self, observation_space=venv.observation_space,
                    action_space=venv.action_space)
    print('Initializing bullet VecNormalize.')
    self.venv = venv
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(1)  # TODO: use self.num_envs
    self.gamma = gamma
    self.epsilon = epsilon
def __init__(self, curiosity_program, reward_combiner_program,
             curiosity_data_structure_values, curiosity_optimizer_values,
             reward_combiner_data_structure_values,
             reward_combiner_optimizer_values, envs, policy):
    self.curiosity_program = curiosity_program
    self.reward_combiner_program = reward_combiner_program
    self.curiosity_data_structure_values = curiosity_data_structure_values
    self.curiosity_optimizer_values = curiosity_optimizer_values
    self.reward_combiner_data_structure_values = reward_combiner_data_structure_values
    self.reward_combiner_optimizer_values = reward_combiner_optimizer_values
    self.envs = envs

    self.internal_reward_normalizer_all = mlca.helpers.statistics.welfords_std.Welford()
    self.internal_reward_normalizer_window: List[int] = []

    # From https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py
    self.ret_rms = RunningMeanStd(shape=())
    self.clipob = 10.
    self.cliprew = 10.
    self.ret = np.zeros(TspParams.current().NUM_ROLLOUTS_PER_TRIAL)
    self.gamma = TspParams.current().DECAY_RATE
    assert self.gamma == .99
    self.epsilon = 1e-8
def __init__(self, num_inputs, input_size, action_space, hidden_size=64,
             recurrent=False, device='cpu'):
    super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size)
    self.device = device

    if recurrent:
        num_inputs = hidden_size

    init__ = lambda m: init(m, nn.init.orthogonal_,
                            lambda x: nn.init.constant_(x, 0), np.sqrt(2))

    self.trunk = nn.Sequential(
        init__(nn.Linear(num_inputs + action_space.shape[0], hidden_size)),
        nn.Tanh(),
        init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
        init__(nn.Linear(hidden_size, 1)))

    self.optimizer = torch.optim.Adam(self.parameters())
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    self.train()
def __init__(self, venv, pretrained_reward_net_path, chain_path,
             embedding_dim, env_name):
    VecEnvWrapper.__init__(self, venv)
    self.reward_net = EmbeddingNet(embedding_dim)
    # Load the pretrained weights.
    self.reward_net.load_state_dict(torch.load(pretrained_reward_net_path))
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Load the MCMC chain and average it (after burn-in, with thinning).
    burn = 5000
    skip = 20
    reader = open(chain_path)
    data = []
    for line in reader:
        parsed = line.strip().split(',')
        np_line = [float(s) for s in parsed[:-1]]
        data.append(np_line)
    data = np.array(data)

    # Use the mean across the chain as the last layer of the network:
    # it just outputs the scalar reward = w^T \phi(s).
    mean_weight = np.mean(data[burn::skip, :], axis=0)
    self.reward_net.fc2 = nn.Linear(embedding_dim, 1, bias=False)
    new_linear = torch.from_numpy(mean_weight)
    print("new linear", new_linear)
    print(new_linear.size())
    with torch.no_grad():
        # unsqueeze since nn.Linear wants a 2-d tensor for weights
        new_linear = new_linear.unsqueeze(0)
        self.reward_net.fc2.weight.data = new_linear.float().to(self.device)
    # TODO: print out the last layer to make sure the weights stuck.
    print("USING MEAN WEIGHTS FROM MCMC")

    self.reward_net.to(self.device)
    self.rew_rms = RunningMeanStd(shape=())
    self.epsilon = 1e-8
    self.cliprew = 10.
    self.env_name = env_name
def __init__(self, device=None, envs=None, ensemble_policy=None, env_name=None,
             expert_dataset=None, ensemble_size=None,
             ensemble_quantile_threshold=None, dril_bc_model=None,
             dril_cost_clip=None, num_dril_bc_train_epoch=None,
             training_data_split=None):
    self.ensemble_quantile_threshold = ensemble_quantile_threshold
    self.dril_cost_clip = dril_cost_clip
    self.device = device
    self.num_dril_bc_train_epoch = num_dril_bc_train_epoch
    self.env_name = env_name
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    self.observation_space = envs.observation_space

    # The number of action dimensions depends on the action-space type.
    if envs.action_space.__class__.__name__ == "Discrete":
        self.num_actions = envs.action_space.n
    elif envs.action_space.__class__.__name__ == "Box":
        self.num_actions = envs.action_space.shape[0]
    elif envs.action_space.__class__.__name__ == "MultiBinary":
        self.num_actions = envs.action_space.shape[0]

    self.ensemble_size = ensemble_size
    # Use the full dataset, since we don't use a validation set.
    self.trdata = expert_dataset.load_demo_data(1.0, 1, self.ensemble_size)['trdata']
    self.ensemble = ensemble_policy
    self.bc = dril_bc_model
    self.bc.num_batches = num_dril_bc_train_epoch
    self.clip_variance = self.policy_variance(envs=envs)
def __init__(self, env, model, nsteps, icm, gamma, curiosity):
    super().__init__(env=env, model=model, nsteps=nsteps, icm=icm)
    assert isinstance(env.action_space, spaces.Discrete), \
        'This ACER implementation works only with discrete action spaces!'
    assert isinstance(env, VecFrameStack)

    self.nact = env.action_space.n
    nenv = self.nenv
    self.nbatch = nenv * nsteps
    self.batch_ob_shape = (nenv * (nsteps + 1),) + env.observation_space.shape
    self.curiosity = curiosity

    self.obs = env.reset()
    self.obs_dtype = env.observation_space.dtype
    self.ac_dtype = env.action_space.dtype
    self.nstack = self.env.nstack
    self.nc = self.batch_ob_shape[-1] // self.nstack
    self.rff = RewardForwardFilter(gamma)
    self.rff_rms = RunningMeanStd()
    # print(" What is NC ", self.nc)
    print(" State of curiosity : ", icm)
def __init__(self, input_dim, hidden_dim, device, red=None, sail=False, learn=True):
    super(Discriminator, self).__init__()
    self.device = device
    self.red = red
    self.sail = sail
    self.redtrained = False
    if self.sail:
        assert self.red is not None, 'Cannot run SAIL without using RED'

    self.trunk = nn.Sequential(
        nn.Linear(input_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, 1)).to(device)
    self.trunk.train()
    self.learn = learn

    self.optimizer = torch.optim.Adam(self.trunk.parameters())
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
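# The returns/ret_rms pair above is typically used to scale the discriminator
# reward before handing it to the policy. A hedged method sketch in the style
# of pytorch-a2c-ppo-acktr GAIL code (the name predict_reward and the exact
# reward form are assumptions; numpy/torch imported as in the surrounding code):
def predict_reward(self, state, action, gamma, masks):
    with torch.no_grad():
        d = self.trunk(torch.cat([state, action], dim=1))
        s = torch.sigmoid(d)
        reward = s.log() - (1 - s).log()  # log D(s,a) - log(1 - D(s,a))
        if self.returns is None:
            self.returns = reward.clone()
        # Discounted-return running std, analogous to VecNormalize's ret_rms;
        # masks zeroes the return at episode boundaries.
        self.returns = self.returns * masks * gamma + reward
        self.ret_rms.update(self.returns.cpu().numpy())
        return reward / np.sqrt(self.ret_rms.var + 1e-8).item()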
def __init__(self, input_dim, action_dim, hidden_size=100, embed_size=0,
             base=None, base_kwargs=None, device='cpu'):
    super(CorDiscriminator, self).__init__()
    if base_kwargs is None:
        base_kwargs = {}
    if base is None:
        if len(input_dim) == 3:
            base = CNNBase
        elif len(input_dim) == 1:
            base = MLPBase
        else:
            raise NotImplementedError
    self.base = base(input_dim[0], input_dim[1:], action_dim, hidden_size,
                     embed_size, device=device, **base_kwargs)
    # Note: this shadows nn.Module.parameters with the base's parameter iterator.
    self.parameters = self.base.parameters()
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
def __init__(self, num_inputs, input_size, action_space, hidden_size=64,
             embed_size=0, recurrent=False, device='cpu'):
    super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size, embed_size)
    self.device = device

    if recurrent:
        num_inputs = hidden_size

    init__ = lambda m: init(m, nn.init.orthogonal_,
                            lambda x: nn.init.constant_(x, 0), np.sqrt(2))

    self.trunk = nn.Sequential(
        init__(nn.Linear(num_inputs + action_space.shape[0] + embed_size,
                         hidden_size)), nn.Tanh(),
        init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
        init__(nn.Linear(hidden_size, 1)))

    # self.optimizer = torch.optim.Adam(self.parameters(), lr=3e-5)
    self.optimizer = torch.optim.RMSprop(self.parameters(), lr=5e-5)
    # Note: this local assignment shadows the `device` argument and is unused.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    self.train()
def __init__(self, input_dim, hidden_dim, device, gail_reward_type=None,
             clip_gail_action=None, envs=None, disc_lr=None):
    super(Discriminator, self).__init__()
    self.device = device

    self.trunk = nn.Sequential(
        nn.Linear(input_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
        nn.Linear(hidden_dim, 1)).to(device)
    self.trunk.train()

    self.optimizer = torch.optim.Adam(self.trunk.parameters(), lr=disc_lr)
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    self.reward_type = gail_reward_type
    self.clip_gail_action = clip_gail_action
    self.action_space = envs.action_space
def __init__(self, venv, model_dir, ctrl_coeff=0., alive_bonus=0.):
    super().__init__(venv, model_dir, ctrl_coeff, alive_bonus)
    # One return filter per ensemble member.
    self.rew_rms = [RunningMeanStd(shape=()) for _ in range(len(self.models))]
    self.cliprew = 100.
    self.epsilon = 1e-8
def __call__(self, x):
    x = np.asarray(x)
    # Lazily create the filter once the input shape is known.
    if self.rms is None:
        self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
    if not self.read_only:
        self.rms.update(x)
    return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                   -self.clip, self.clip)
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    self.caculate_number_parameters(params)
    # Split the parameters into flow and agent groups, each trained with its
    # own learning rate, then group the two ops into a single train op.
    flow_params = [v for v in params if 'flow' in v.name]
    other_params = [v for v in params if 'flow' not in v.name]
    print('length of flow params: ', len(flow_params))
    print('length of agent params: ', len(other_params))
    trainer_flow = tf.train.AdamOptimizer(learning_rate=self.flow_lr)
    trainer_agent = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    grads = tf.gradients(self.total_loss, flow_params + other_params)
    grads_flow = grads[:len(flow_params)]
    grads_agent = grads[len(flow_params):]
    train_flow = trainer_flow.apply_gradients(zip(grads_flow, flow_params))
    train_agent = trainer_agent.apply_gradients(zip(grads_agent, other_params))
    self._train = tf.group(train_flow, train_agent)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(getsess(),
                            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space])
        for l in range(self.nlump)
    ]
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space,
                           nenvs=nenvs, nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff,
                           ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder,
                           dynamics=dynamics)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
def __init__(self, envs, size_obs_to_norm=13, ob=True, ret=True, clipob=10.,
             cliprew=10., gamma=0.95, epsilon=1e-8, use_tf=False):
    self.envs = envs
    # In the non-TF branch, only the first size_obs_to_norm observation
    # dimensions are tracked for normalization.
    self.size_obs_to_norm = size_obs_to_norm
    if use_tf:
        from baselines.common.running_mean_std import TfRunningMeanStd
        self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                       scope='ob_rms') if ob else None
        self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
    else:
        from baselines.common.running_mean_std import RunningMeanStd
        self.ob_rms = RunningMeanStd(shape=(size_obs_to_norm,)) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
    nbatch = nenv * nsteps
    ob_shape = (nbatch, ob_space.shape[0] * nstack)
    nact = ac_space.shape[0]
    X = tf.placeholder(tf.float32, ob_shape)  # obs
    self.pdtype = pdtype = make_pdtype(ac_space)

    with tf.variable_scope("obfilter", reuse=reuse):
        self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
    with tf.variable_scope("retfilter", reuse=reuse):
        self.ret_rms = RunningMeanStd(shape=(1,))

    obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
    # obz = X

    with tf.variable_scope("model", reuse=reuse):
        h1 = tf.nn.tanh(dense(obz, 128, "fc1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h2 = tf.nn.tanh(dense(h1, 128, "fc2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        h3 = tf.nn.tanh(dense(h2, 128, "fc3", weight_init=U.normc_initializer(1.0), bias_init=0.0))
        mean = dense(h3, nact, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
        logstd = tf.get_variable("logstd", [nact], tf.float32, tf.zeros_initializer())
        logstd = tf.expand_dims(logstd, 0)
        pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        vf = dense(h3, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)
        v0 = vf[:, 0]

    self.pd = pdtype.pdfromflat(pdparam)

    stochastic = tf.placeholder(dtype=tf.bool, shape=())
    a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())
    self.initial_state = []  # not stateful

    def step(stoch, ob, *_args, **_kwargs):
        a, v = sess.run([a0, v0], {stochastic: stoch, X: ob})
        return a, v, []  # dummy state

    def value(ob, *_args, **_kwargs):
        return sess.run(v0, {X: ob})

    self.X = X
    self.vf = vf
    self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
    self.step = step
    self.value = value
def __init__(self, num_inputs, input_size, action_space, hidden_size=512,
             embed_size=0, recurrent=False, device='cpu'):
    super(CNNBase, self).__init__(recurrent, num_inputs, hidden_size, embed_size)
    self.device = device
    self.action_space = action_space

    h, w = input_size
    self.conv1 = nn.Conv2d(num_inputs, 32, kernel_size=8, stride=4)
    w_out = conv2d_size_out(w, kernel_size=8, stride=4)
    h_out = conv2d_size_out(h, kernel_size=8, stride=4)
    self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
    w_out = conv2d_size_out(w_out, kernel_size=4, stride=2)
    h_out = conv2d_size_out(h_out, kernel_size=4, stride=2)
    self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1)
    w_out = conv2d_size_out(w_out, kernel_size=3, stride=1)
    h_out = conv2d_size_out(h_out, kernel_size=3, stride=1)

    init_cnn_ = lambda m: init(m, nn.init.orthogonal_,
                               lambda x: nn.init.constant_(x, 0),
                               nn.init.calculate_gain('relu'))
    self.cnn_trunk = nn.Sequential(
        init_cnn_(self.conv1), nn.ReLU(),
        init_cnn_(self.conv2), nn.ReLU(),
        init_cnn_(self.conv3), nn.ReLU(), Flatten(),
        init_cnn_(nn.Linear(32 * h_out * w_out, hidden_size)), nn.ReLU())

    init__ = lambda m: init(m, nn.init.orthogonal_,
                            lambda x: nn.init.constant_(x, 0), np.sqrt(2))
    self.trunk = nn.Sequential(
        init__(nn.Linear(hidden_size + self.action_space.n + embed_size,
                         hidden_size // 2)), nn.Tanh(),
        init__(nn.Linear(hidden_size // 2, hidden_size // 2)), nn.Tanh(),
        init__(nn.Linear(hidden_size // 2, 1)))

    # self.optimizer = torch.optim.Adam(self.parameters(), lr=3e-5)
    # RMSprop to be consistent with the WGAN optimizer, although not necessary.
    self.optimizer = torch.optim.RMSprop(self.parameters(), lr=5e-5)
    self.returns = None
    self.ret_rms = RunningMeanStd(shape=())
    # Note: this local assignment shadows the `device` argument and is unused.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def __call__(self, x):
    from baselines.common.running_mean_std import RunningMeanStd
    x = np.asarray(x)
    if self.rms is None:
        self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
    if not self.read_only:
        self.rms.update(x)
    return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                   -self.clip, self.clip)
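# The two __call__ snippets above imply a filter object holding rms, clip,
# epsilon, and read_only state. A minimal self-contained sketch of the whole
# filter under that assumption (the class name and __init__ are not shown in
# the originals and are hypothetical):
import numpy as np
from baselines.common.running_mean_std import RunningMeanStd

class RunningNormFilter:
    def __init__(self, clip=10., epsilon=1e-8, read_only=False):
        self.rms = None             # created lazily once the input shape is known
        self.clip = clip
        self.epsilon = epsilon
        self.read_only = read_only  # freeze the statistics, e.g. for evaluation

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                       -self.clip, self.clip)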
def __init__(self, venv, ob=False, ret=False, clipob=10., cliprew=10.,
             gamma=0.99, epsilon=1e-8):
    # Akhil: add running mean and variance here so the correct mean and var
    # can be inputted here when a model is loaded!
    VecEnvWrapper.__init__(self, venv)
    self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
    self.ret_rms = RunningMeanStd(shape=()) if ret else None
    self.clipob = clipob
    self.cliprew = cliprew
    self.ret = np.zeros(self.num_envs)
    self.gamma = gamma
    self.epsilon = epsilon
def start_interaction(self, env_fns, dynamics, nlump=2):
    # Define the variables and computation graph when interaction with the
    # environment starts, and initialize the Rollout class.
    self.loss_names, self._losses = zip(*list(self.to_report.items()))

    # Define the losses, gradients, and backprop; training calls
    # sess.run(self._train) to iterate.
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
    print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in params]))  # 6629459
    print("dvae params:", np.sum([np.prod(v.get_shape().as_list()) for v in params_dvae]))  # 2726144

    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)

    # add bai. Compute the DVAE gradients separately.
    gradsandvars_dvae = trainer.compute_gradients(self.dynamics_loss, params_dvae)
    self._train_dvae = trainer.apply_gradients(gradsandvars_dvae)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(getsess(),
                            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)       # default 128
    self.nlump = nlump                      # default 1
    self.lump_stride = nenvs // self.nlump  # 128 / 1 = 128
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space])
        for l in range(self.nlump)
    ]

    # The Rollout class is defined in rollouts.py.
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space,
                           nenvs=nenvs, nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff,
                           ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder,
                           dynamics=dynamics)

    # Buffers shaped (number of envs/threads, rollout horizon T).
    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()