Example no. 1
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Example no. 2
    def __init__(self, venv, pretrained_reward_net_path, chain_path,
                 embedding_dim, env_name):
        VecEnvWrapper.__init__(self, venv)
        self.reward_net = EmbeddingNet(embedding_dim)
        #load the pretrained weights
        self.reward_net.load_state_dict(torch.load(pretrained_reward_net_path))
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        #load the mean of the MCMC chain
        burn = 5000
        skip = 20
        with open(chain_path) as reader:
            data = []
            for line in reader:
                parsed = line.strip().split(',')
                data.append([float(s) for s in parsed[:-1]])
        data = np.array(data)

        # get the average across the chain and use it as the last layer of the network
        mean_weight = np.mean(data[burn::skip, :], axis=0)
        self.reward_net.fc2 = nn.Linear(
            embedding_dim, 1, bias=False
        )  #last layer just outputs the scalar reward = w^T \phi(s)

        new_linear = torch.from_numpy(mean_weight)
        with torch.no_grad():
            # unsqueeze since nn.Linear wants a 2-d tensor for weights
            new_linear = new_linear.unsqueeze(0)
            self.reward_net.fc2.weight.data = new_linear.float().to(
                self.device)

        # TODO: print out the last layer to make sure the weights stuck
        print("USING MEAN WEIGHTS FROM MCMC")

        self.reward_net.to(self.device)

        self.rew_rms = RunningMeanStd(shape=())
        self.epsilon = 1e-8
        self.cliprew = 10.
        self.env_name = env_name
Example no. 3
    def __init__(self, input_dim, hidden_dim, device):
        super(Discriminator, self).__init__()

        self.device = device

        self.trunk = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim,
                                             hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
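
This Discriminator follows the common GAIL pattern (e.g. ikostrikov's pytorch-a2c-ppo-acktr), where self.returns and self.ret_rms are consumed by a reward method during rollouts. A minimal sketch of such a method under that assumption; the name predict_reward, the gamma/masks arguments, and the exact reward form are illustrative, not part of this snippet:

    def predict_reward(self, state, action, gamma, masks, update_rms=True):
        # Score the (state, action) pair, turn the logit into a GAIL-style
        # reward, then rescale by the running std of the discounted returns.
        with torch.no_grad():
            d = self.trunk(torch.cat([state, action], dim=1))
            s = torch.sigmoid(d)
            reward = s.log() - (1 - s).log()
            if self.returns is None:
                self.returns = reward.clone()
            if update_rms:
                # masks are 0 at episode boundaries, resetting the running return
                self.returns = self.returns * masks * gamma + reward
                self.ret_rms.update(self.returns.cpu().numpy())
            std = np.sqrt(self.ret_rms.var + 1e-8).item()
            return reward / std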
Example no. 4
def rms_from_csv(path):
    """Rebuild a RunningMeanStd from a CSV whose first row is the mean
    and whose second row is the variance."""
    with open(path, 'r') as file:
        reader = csv.reader(file)
        values = []

        for row in reader:
            values.append([float(i) for i in row])

    mean = np.array(values[0])
    var = np.array(values[1])

    rms = RunningMeanStd(shape=mean.shape)
    rms.mean = mean
    rms.var = var

    return rms
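
For round-tripping, a matching writer makes the layout explicit. The helper below is a hypothetical counterpart (rms_to_csv is not part of the original source) that emits the two-row mean/var format rms_from_csv reads back, assuming the same csv/numpy imports and 1-D statistics:

def rms_to_csv(rms, path):
    # Hypothetical counterpart to rms_from_csv:
    # row 0 holds the mean, row 1 holds the variance.
    with open(path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(np.asarray(rms.mean).tolist())
        writer.writerow(np.asarray(rms.var).tolist())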
Example no. 5
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 args,
                 log_only=False):
        super(AIL, self).__init__()

        if log_only:
            self.m_return_list = self.load_expert_data(args)
            return

        self.lr = args.il_lr  # larger learning rate for MLP
        self.action_dim = action_space.shape[0]
        self.hidden_dim = 100

        self.state_dim = observation_space.shape[0]

        self.device = device
        self.create_networks()

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.gail_batch_size = args.gail_batch_size
        self.label_expert = 1
        self.label_policy = -1
        self.reward_std = args.reward_std
        self.gp_lambda = args.gp_lambda
        self.m_return_list = self.make_dataset(args)

        if args.ail_saturate is None and args.ail_loss_type != "unhinged":
            args.ail_saturate = 1

        if args.ail_loss_type == "logistic":
            self.adversarial_loss = Logistic_Loss()
        elif args.ail_loss_type == "unhinged":
            self.adversarial_loss = Unhinged_Loss()
            if args.ail_saturate is None: args.ail_saturate = 0
        elif args.ail_loss_type == "sigmoid":
            self.adversarial_loss = Sigmoid_Loss()
        elif args.ail_loss_type == "nlogistic":
            self.adversarial_loss = Normalized_Logistic_Loss()
        elif args.ail_loss_type == "apl":
            self.adversarial_loss = APL_Loss()
        self.ail_saturate = args.ail_saturate
Example no. 6
class VecNormalize(VecEnvWrapper):
    """
    Vectorized environment base class
    """
    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        """
        Apply sequence of actions to sequence of environments
        actions -> (observations, rewards, news)

        where 'news' is a boolean vector indicating whether each element is new.
        """
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma * (1 - news) + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            tmp = copy.deepcopy(obs)
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
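            # restore the last six observation dimensions unnormalized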
            for i in range(len(tmp)):
                obs[i][-6:] = tmp[i][-6:]
            return obs
        else:
            return obs

    def reset(self):
        """
        Reset all environments
        """
        obs = self.venv.reset()
        return self._obfilt(obs)
Example no. 7
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                           -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        obs = self.venv.reset()
        return self._obfilt(obs)
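
For context, a typical way such a wrapper is used (a minimal sketch assuming OpenAI Baselines' DummyVecEnv and a Gym environment id; the names here are illustrative, not part of the original snippet):

from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
import gym
import numpy as np

venv = DummyVecEnv([lambda: gym.make("CartPole-v1") for _ in range(4)])
venv = VecNormalize(venv)
obs = venv.reset()  # observations come back normalized and clipped
actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
obs, rews, news, infos = venv.step(actions)  # rewards rescaled by return std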
Example no. 8
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Example no. 9
class MeanStdNormalizer(BaseNormalizer):
    def __init__(self, read_only=False, clip=10.0, epsilon=1e-8):
        BaseNormalizer.__init__(self, read_only)
        self.read_only = read_only
        self.rms = None
        self.clip = clip
        self.epsilon = epsilon

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            self.rms = RunningMeanStd(shape=(1, ) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip(
            (x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
            -self.clip, self.clip)
Example no. 10
    def __init__(self, venv, model_dir, ctrl_coeff=0., alive_bonus=0.):
        super().__init__(venv, model_dir, ctrl_coeff, alive_bonus)

        self.rew_rms = [
            RunningMeanStd(shape=()) for _ in range(len(self.models))
        ]
        self.cliprew = 100.
        self.epsilon = 1e-8
Example no. 11
    def no_mpi_start_interaction(self, envs, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)
        last_step = 0
        if self.load:
            last_step = self._load_graph()
        else:
            self._initialize_graph()
            # bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(envs)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = envs

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)
        self.rollout.stats['tcount'] += last_step

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
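
The rff/rff_rms pair above implements intrinsic-reward normalization: rewards are divided by the standard deviation of a discounted running sum of rewards rather than of the raw rewards. A minimal sketch of RewardForwardFilter, close to the one in the curiosity/RND codebases these snippets derive from:

class RewardForwardFilter(object):
    def __init__(self, gamma):
        self.rewems = None  # running discounted sum of rewards
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems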
Example no. 12
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv*nsteps
        ob_shape = (nbatch, ob_space.shape[0]*nstack)
        nact = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape) #obs
        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("obfilter", reuse=reuse):
            self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
        with tf.variable_scope("retfilter", reuse=reuse):
            self.ret_rms = RunningMeanStd(shape=(1,))

        obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        #obz = X

        with tf.variable_scope("model", reuse=reuse):
            h1 = tf.nn.tanh(dense(obz, 128, "fc1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h2 = tf.nn.tanh(dense(h1, 128, "fc2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h3 = tf.nn.tanh(dense(h2, 128, "fc3", weight_init=U.normc_initializer(1.0), bias_init=0.0))

            mean = dense(h3, nact, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
            logstd = tf.get_variable("logstd", [nact], tf.float32, tf.zeros_initializer())
            logstd = tf.expand_dims(logstd, 0)
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            vf = dense(h3, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)

        v0 = vf[:, 0]
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self.initial_state = [] #not stateful

        def step(stoch, ob, *_args, **_kwargs):
            a, v = sess.run([a0, v0], {stochastic:stoch, X:ob})
            return a, v, [] #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X:ob})

        self.X = X
        self.vf = vf
        self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
        self.step = step
        self.value = value
Example no. 13
    def start_interaction(self, env_fns, dynamics, nlump=2):
        print("start_interaction",env_fns)
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = 256
        self.nlump = 1
        self.lump_stride = nenvs // self.nlump
        self.envs = [env_fns]

        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Example no. 14
class VecNormalize(VecEnvWrapper):
    def __init__(self, venv, visual_obs=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.spaces['visual'].shape) if visual_obs else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.training = True

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs['visual'] = self._obfilt(obs['visual'])
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs, update=True):
        if self.ob_rms:
            if self.training and update:
                self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) /
                          np.sqrt(self.ob_rms.var + self.epsilon),
                          -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        obs['visual'] = self._obfilt(obs['visual'])
        return obs

    def train(self):
        self.training = True

    def eval(self):
        self.training = False
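
The train/eval toggle above is typically used to freeze the normalization statistics when evaluating a policy (an illustrative sketch; run_evaluation_episodes is a hypothetical helper, not part of this snippet):

venv.eval()                     # freeze ob_rms: normalize, but stop updating stats
run_evaluation_episodes(venv)   # hypothetical evaluation loop
venv.train()                    # resume updating statistics for training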
Example no. 15
    def __init__(
        self,
        venv,
        ob=False,
        ret=False,
        clipob=10.,
        cliprew=10.,
        gamma=0.99,
        epsilon=1e-8
    ):  # Akhil: add running mean and variance here so the correct mean and var can be fed in when a model is loaded!
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(
            shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Example no. 16
class Normalize(gym.Wrapper):
    """
    A wrapper that normalizes the observations
    and returns from an environment.
    """
    def __init__(self, env, clip_ob=10, clip_rew=10, epsilon=1e-8, gamma=0.99):
        super().__init__(env)
        self.clip_ob = clip_ob
        self.clip_rew = clip_rew
        self._reset_rew()
        self.gamma = gamma
        self.epsilon = epsilon

        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())

    def step(self, action):
        obs, rew, done, misc = self.env.step(action)
        self.ret = self.ret * self.gamma + rew
        self.ret_rms.update(self.ret)
        rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
                      -self.clip_rew, self.clip_rew)
        if done:
            self._reset_rew()

        obs = self._ob_filter(obs)
        return obs, rew, done, misc

    def reset(self):
        self._reset_rew()
        obs = self.env.reset()
        return self._ob_filter(obs)

    def _ob_filter(self, obs):
        self.ob_rms.update(obs)
        obs = np.clip(
            (obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
            -self.clip_ob, self.clip_ob)
        return obs

    def _reset_rew(self):
        self.ret = np.zeros((1, ), dtype=np.float32)
Example no. 17
    def __init__(self, venv, nets, ctrl_coeff):
        RewardWrapper.__init__(self, venv)
        self.venv = venv
        self.ctrl_coeff = ctrl_coeff

        # TODO change for one net
        self.nets = [nets]

        self.cliprew = 10.
        self.epsilon = 1e-8
        self.rew_rms = [RunningMeanStd(shape=()) for _ in range(len(self.nets))]
Example no. 18
    def __init__(self, venv, reward_net_path, env_name):
        VecEnvWrapper.__init__(self, venv)
        self.reward_net = AtariNet()
        self.reward_net.load_state_dict(torch.load(reward_net_path))
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.reward_net.to(self.device)

        self.rew_rms = RunningMeanStd(shape=())
        self.epsilon = 1e-8
        self.cliprew = 10.
        self.env_name = env_name
Example no. 19
    def __init__(self, venv, reward_net_path, combo_param):
        VecEnvWrapper.__init__(self, venv)
        self.reward_net = AtariNet()
        self.reward_net.load_state_dict(torch.load(reward_net_path))
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.reward_net.to(self.device)

        self.lamda = combo_param  # how much weight to give to IRL versus RL: combo_param is in [0, 1], with 0 being pure RL and 1 being pure IRL
        self.rew_rms = RunningMeanStd(shape=())
        self.epsilon = 1e-8
        self.cliprew = 10.
Example no. 20
    def __init__(self, state_dim, action_dim, user_dim, device, lr):
        super(Discriminator, self).__init__()

        self.device = device

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.label_embedding = nn.Embedding(10, 10)
        self.prefc1 = nn.Linear(user_dim, 25)

        self.linear = nn.Linear(state_dim * 6 + action_dim, 81)
        self.relu = nn.LeakyReLU(0.2, inplace=True)
        self.conv1 = nn.Conv2d(1, 2, 3)
        self.pool = nn.MaxPool2d(2, 1)
        self.conv2 = nn.Conv2d(2, 20, 3)
        self.conv2_bn = nn.BatchNorm2d(20)
        self.fc1 = nn.Linear(180, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 1)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)
Example no. 21
    def __init__(self, env, model, nsteps, icm, gamma, curiosity):
        super().__init__(env=env, model=model, nsteps=nsteps, icm=icm)
        assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv*(nsteps+1),) + env.observation_space.shape

        self.obs = env.reset()
        self.obs_dtype = env.observation_space.dtype
        self.ac_dtype = env.action_space.dtype
        self.nstack = self.env.nstack
        self.nc = self.batch_ob_shape[-1] // self.nstack

        self.curiosity = curiosity
        if self.curiosity:
            self.rff = RewardForwardFilter(gamma)
            self.rff_rms = RunningMeanStd()
Example no. 22
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(),
                                            scope='ret_rms') if ret else None
        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        # parse the optional useReset0 environment variable ("True"/"False")
        use_reset0 = os.getenv("useReset0")
        self.useReset0 = False if use_reset0 in (None, "None") else eval(
            use_reset0.capitalize())
        logger.log(" useReset0 is %s" % str(self.useReset0))
Example no. 23
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8,
                 use_tf=False):
        VecEnvWrapper.__init__(self, venv)
        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                           scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(),
                                            scope='ret_rms') if ret else None
        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Example no. 24
    def __init__(self, obs_shape, hidden_dim, num_actions, device, disc_lr,
                 gail_reward_type=None, envs=None):
        super(DiscriminatorCNN, self).__init__()

        self.device = device

        init_ = lambda m: init(m, nn.init.orthogonal_, lambda x: nn.init.
                               constant_(x, 0), nn.init.calculate_gain('relu'))

        self.num_actions = num_actions
        # use the provided device rather than forcing CUDA
        self.action_emb = nn.Embedding(num_actions, num_actions).to(device)
        num_inputs = obs_shape.shape[0] + num_actions

        self.cnn = nn.Sequential(init_(nn.Conv2d(num_inputs, 32, 8, stride=4)),
                                 nn.ReLU(),
                                 init_(nn.Conv2d(32, 64, 4, stride=2)),
                                 nn.ReLU(),
                                 init_(nn.Conv2d(64, 32, 3, stride=1)),
                                 nn.ReLU(), Flatten(),
                                 init_(nn.Linear(32 * 7 * 7, hidden_dim)),
                                 nn.ReLU()).to(device)

        self.trunk = nn.Sequential(nn.Linear(hidden_dim,
                                             hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim,
                                             hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim, 1)).to(device)

        self.cnn.train()
        self.trunk.train()

        self.optimizer = torch.optim.Adam(list(self.trunk.parameters()) +
                                          list(self.cnn.parameters()),
                                          lr=disc_lr)

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.reward_type = gail_reward_type
Example no. 25
class VecNormalize(VecEnvWrapper):
    """
    A vectorized wrapper that normalizes the observations
    and returns from an environment.
    """

    def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon

    def step_wait(self):
        obs, rews, news, infos = self.venv.step_wait()
        self.ret = self.ret * self.gamma + rews
        obs = self._obfilt(obs)
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
        self.ret[news] = 0.
        return obs, rews, news, infos

    def _obfilt(self, obs):
        if self.ob_rms:
            self.ob_rms.update(obs)
            obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
            return obs
        else:
            return obs

    def reset(self):
        self.ret = np.zeros(self.num_envs)
        obs = self.venv.reset()
        return self._obfilt(obs)
Example no. 26
    def __init__(self, obs_norm):

        self.env = gym.make('Pendulum-v0')

        self.rms = {
            'rewards': RunningMeanStd(epsilon=1e-9, shape=(1,)),
            'returns': RunningMeanStd(epsilon=1e-9, shape=(1,)),
        }

        self.ep_total_reward_list = []
        self.reward_list = []

        self.obs_norm = obs_norm

        self.build_network()

        # create the TF session via baselines.common.tf_util
        self.sess = get_session()
        initialize()
Example no. 27
    def __init__(
        self,
        venv: Env,
        ob: bool = True,
        ret: bool = True,
        clipob: float = 10.0,
        cliprew: float = 10.0,
        gamma: float = 0.99,
        epsilon: float = 1e-8,
        first_n: int = None,
    ) -> None:
        """
        Modified init function of VecNormalize. The only change here is in modifying the
        shape of self.ob_rms. The argument ``first_n`` controls how much of the
        observation we want to normalize: for an observation ``obs``, we normalize the
        vector ``obs[:first_n]``.
        """

        VecEnvWrapper.__init__(self, venv)
        if ob is not None:
            if first_n is None:
                self.ob_rms = RunningMeanStd(
                    shape=self.observation_space.shape)
            else:
                if len(self.observation_space.shape) == 1:
                    self.ob_rms = RunningMeanStd(shape=(first_n, ))
                else:
                    raise NotImplementedError
        else:
            self.ob_rms = None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
        self.first_n = first_n
Example no. 28
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        if isinstance(self.observation_space, Dict):
            self.ob_rms = {}
            for key in self.observation_space.spaces.keys():
                self.ob_rms[key] = RunningMeanStd(
                    shape=self.observation_space.spaces[key].shape
                ) if ob else None
        else:
            self.ob_rms = RunningMeanStd(
                shape=self.observation_space.shape) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Example no. 29
    def __init__(self, input_dim, hidden_dim, device, reward_type, update_rms, cliprew_down=-10.0, cliprew_up=10.0):
        super(Discriminator, self).__init__()
        self.cliprew_down = cliprew_down
        self.cliprew_up = cliprew_up
        self.device = device
        self.reward_type = reward_type
        self.update_rms = update_rms

        # (an earlier variant ended the trunk with an extra nn.Tanh())

        self.trunk = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
Example no. 30
    def __init__(self,
                 venv,
                 num_models,
                 model_dir,
                 include_action,
                 num_layers,
                 embedding_dims,
                 ctrl_coeff=0.,
                 alive_bonus=0.):
        super().__init__(venv, num_models, model_dir, include_action,
                         num_layers, embedding_dims, ctrl_coeff, alive_bonus)

        self.rew_rms = [RunningMeanStd(shape=()) for _ in range(num_models)]
        self.cliprew = 10.
        self.epsilon = 1e-8
Example no. 31
def test_runningmeanstd():
    for (x1, x2, x3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)),
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])

        x = np.concatenate([x1, x2, x3], axis=0)
        ms1 = [x.mean(axis=0), x.var(axis=0)]
        rms.update(x1)
        rms.update(x2)
        rms.update(x3)
        ms2 = [rms.mean, rms.var]

        assert np.allclose(ms1, ms2)
Example no. 32
def test_runningmeanstd():
    """Test RunningMeanStd object"""
    for (x_1, x_2, x_3) in [
        (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
        (np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2))
    ]:
        rms = RunningMeanStd(epsilon=0.0, shape=x_1.shape[1:])

        x_cat = np.concatenate([x_1, x_2, x_3], axis=0)
        moments_1 = [x_cat.mean(axis=0), x_cat.var(axis=0)]
        rms.update(x_1)
        rms.update(x_2)
        rms.update(x_3)
        moments_2 = [rms.mean, rms.var]

        assert np.allclose(moments_1, moments_2)
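
Both tests assume the parallel-moments update (Chan et al.) used by OpenAI Baselines' RunningMeanStd; for reference, a minimal sketch in that style:

class RunningMeanStd(object):
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon  # pseudo-count stabilizing the first updates

    def update(self, x):
        batch_mean = np.mean(x, axis=0)
        batch_var = np.var(x, axis=0)
        batch_count = x.shape[0]
        # merge the batch moments into the running moments (parallel variance)
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        self.mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.var = m_2 / tot_count
        self.count = tot_count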