Code Example #1
File: vec_normalize.py  Project: LanxinL/baselines
 def __init__(self,
              venv,
              ob=True,
              ret=True,
              clipob=10.,
              cliprew=10.,
              gamma=0.99,
              epsilon=1e-8,
              use_tf=False):
     VecEnvWrapper.__init__(self, venv)
     if use_tf:
         from baselines.common.running_mean_std import TfRunningMeanStd
         self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                        scope='ob_rms') if ob else None
         self.ret_rms = TfRunningMeanStd(shape=(),
                                         scope='ret_rms') if ret else None
     else:
         from baselines.common.running_mean_std import RunningMeanStd
         self.ob_rms = RunningMeanStd(
             shape=self.observation_space.shape) if ob else None
         self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
     self.useReset0 = False if os.getenv(
         "useReset0") == "None" or os.getenv("useReset0") is None else eval(
             os.getenv('useReset0').capitalize())
     logger.log(" useReset0 is %s" % str(self.useReset0))
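Note: the useReset0 flag above is parsed from an environment variable with eval. A rough sketch of a plainer way to read such a boolean flag (env_flag is a hypothetical helper, not part of the project, and only approximately equivalent to the eval-based logic):

import os

def env_flag(name, default=False):
    # Unset or the literal string "None" falls back to the default,
    # mirroring the check in the constructor above.
    val = os.getenv(name)
    if val is None or val == "None":
        return default
    return val.strip().lower() in ("1", "true", "yes")

use_reset0 = env_flag("useReset0")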
Code Example #2
    def __init__(self,
                 venv,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        VecEnvWrapper.__init__(self, venv)
        try:
            self.num_agents = num_agents = len(self.observation_space)
            self.ob_rms = [
                RunningMeanStd(shape=self.observation_space[k].shape)
                for k in range(num_agents)
            ] if ob else None
        except:
            self.num_agents = num_agents = len(self.observation_space.spaces)
            self.ob_rms = [
                RunningMeanStd(shape=self.observation_space.spaces[k].shape)
                for k in range(num_agents)
            ] if ob else None

        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        #[RunningMeanStd(shape=()) for k in range(num_agents)] if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        # self.ret = [np.zeros(self.num_envs) for _ in range(num_agents)]
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Code Example #3
File: vec_normalize.py  Project: youngleox/nero
 def __init__(self,
              venv,
              ob=True,
              ret=True,
              clipob=10.,
              cliprew=10.,
              gamma=0.99,
              epsilon=1e-8,
              use_tf=False):
     VecEnvWrapper.__init__(self, venv)
     if use_tf:
         from baselines.common.running_mean_std import TfRunningMeanStd
         self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape,
                                        scope='ob_rms') if ob else None
         self.ret_rms = TfRunningMeanStd(shape=(),
                                         scope='ret_rms') if ret else None
     else:
         from baselines.common.running_mean_std import RunningMeanStd
         self.ob_rms = RunningMeanStd(
             shape=self.observation_space.shape) if ob else None
         self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
Code Example #4
 def __init__(self,
              venv,
              ob=True,
              ret=True,
              train=True,
              noclip=False,
              has_timestep=False,
              ignore_mask=None,
              freeze_mask=None,
              time_scale=1e-3,
              clipob=10.,
              cliprew=10.,
              gamma=0.99,
              epsilon=1e-8):
     VecEnvWrapper.__init__(self, venv)
     self.ob_rms = RunningMeanStd(
         shape=self.observation_space.shape) if ob else None
     self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.train = train
     self.gamma = gamma
     self.epsilon = epsilon
     self.noclip = noclip
     self.ignore_mask = ignore_mask
     self.freeze_mask = freeze_mask
     self.has_timestep = has_timestep
     self.time_scale = time_scale
Code Example #5
 def __init__(self,
              venv,
              norm_obs=True,
              norm_reward=True,
              clip_obs=10.,
              clip_reward=10.,
              gamma=0.99,
              epsilon=1e-8):
     """
      A rolling-average, normalizing, vectorized wrapper for the environment base class
     
     :param venv: ([Gym Environment]) the list of environments to vectorize and normalize
     :param norm_obs: (bool) normalize observation
     :param norm_reward: (bool) normalize reward with discounting (r = sum(r_old) * gamma + r_new)
      :param clip_obs: (float) clipping value for normalizing observations
      :param clip_reward: (float) clipping value for normalizing rewards
     :param gamma: (float) discount factor
     :param epsilon: (float) epsilon value to avoid arithmetic issues
     """
     VecEnvWrapper.__init__(self, venv)
     self.ob_rms = RunningMeanStd(
         shape=self.observation_space.shape) if norm_obs else None
     self.ret_rms = RunningMeanStd(shape=()) if norm_reward else None
     self.clip_obs = clip_obs
     self.clip_reward = clip_reward
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
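Usage note: the excerpt above only shows the constructor, while the docstring describes reward normalization with a discounted running return. The sketch below shows how such a wrapper typically applies the running statistics in its step method; it follows the common OpenAI Baselines VecNormalize pattern and is an assumption rather than part of this project's code (attribute names match the constructor above).

def step_wait(self):
    obs, rews, news, infos = self.venv.step_wait()
    # Maintain the discounted return per environment and use its running std
    # to normalize rewards, as described in the docstring.
    self.ret = self.ret * self.gamma + rews
    if self.ret_rms:
        self.ret_rms.update(self.ret)
        rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon),
                       -self.clip_reward, self.clip_reward)
    # Normalize and clip observations with the running mean/variance.
    if self.ob_rms:
        self.ob_rms.update(obs)
        obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon),
                      -self.clip_obs, self.clip_obs)
    # Reset the running return where an episode ended.
    self.ret[news] = 0.
    return obs, rews, news, infos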
Code Example #6
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.placeholder_lr,
                                       comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.placeholder_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            tf.get_default_session().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            tf.get_default_session(),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.policy,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()
            if self.dynamics.dropout:
                self.rff2 = RewardForwardFilter(self.gamma)
                self.rff_rms2 = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Code Example #7
 def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
     MTVecEnvWrapper.__init__(self, venv)
     self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
     self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
Code Example #8
    def __init__(self, env, clip_ob=10, clip_rew=10, epsilon=1e-8, gamma=0.99):
        super().__init__(env)
        self.clip_ob = clip_ob
        self.clip_rew = clip_rew
        self._reset_rew()
        self.gamma = gamma
        self.epsilon = epsilon

        self.ob_rms = RunningMeanStd(shape=self.observation_space.shape)
        self.ret_rms = RunningMeanStd(shape=())
Code Example #9
 def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, reward_scale=1., update=True):
     VecEnvWrapper.__init__(self, venv)
     self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
     self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
     self.variables_name_save = ['clipob','cliprew','ret','gamma', 'epsilon'  ]
     self.reward_scale = reward_scale
     self.update = update
Code Example #10
 def __init__(self, venv, ob=True, ret=True, clipob=5., cliprew=5., ext_gamma=0.999, int_gamma=0.999, epsilon=1e-8):
     super(VecNormalize, self).__init__(venv)
     self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
     self.ext_ret_rms = RunningMeanStd(shape=()) if ret else None
     self.int_ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipobs = clipob
     self.cliprew = cliprew
     self.ext_ret = np.zeros(self.num_envs)
     self.ext_gamma = ext_gamma
     self.int_ret = np.zeros(self.num_envs)
     self.int_gamma = int_gamma
     self.epsilon = epsilon
Code Example #11
 def __init__(self, *, env, model, nsteps, gamma, lam):
     super().__init__(env=env, model=model, nsteps=nsteps)
     # Lambda used in GAE (General Advantage Estimation)
     self.lam = lam
     # Discount rate
     self.gamma = gamma
     self.clipob = 10.
     self.cliprew = 10.
     self.epsilon = 1e-8
     self.ret = 0
     self.ob_rms = RunningMeanStd(shape=self.env.observation_space.shape)
     self.ret_rms = RunningMeanStd(shape=())
Code Example #12
 def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
     VecEnv.__init__(self,
                     observation_space=venv.observation_space,
                     action_space=venv.action_space)
     print('Initializing bullet vec normalize.')
     self.venv = venv
     self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
     self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(1)   # TODO, self.num_envs
     self.gamma = gamma
     self.epsilon = epsilon
Code Example #13
    def __init__(self, curiosity_program, reward_combiner_program,
                 curiosity_data_structure_values, curiosity_optimizer_values,
                 reward_combiner_data_structure_values,
                 reward_combiner_optimizer_values, envs, policy):
        self.curiosity_program = curiosity_program
        self.reward_combiner_program = reward_combiner_program
        self.curiosity_data_structure_values = curiosity_data_structure_values
        self.curiosity_optimizer_values = curiosity_optimizer_values
        self.reward_combiner_data_structure_values = reward_combiner_data_structure_values
        self.reward_combiner_optimizer_values = reward_combiner_optimizer_values

        self.envs = envs

        self.internal_reward_normalizer_all = mlca.helpers.statistics.welfords_std.Welford(
        )
        self.internal_reward_normalizer_window: List[int] = []

        # From https://github.com/openai/baselines/blob/master/baselines/common/vec_env/vec_normalize.py

        self.ret_rms = RunningMeanStd(shape=())
        self.clipob = 10.
        self.cliprew = 10.
        self.ret = np.zeros(TspParams.current().NUM_ROLLOUTS_PER_TRIAL)
        self.gamma = TspParams.current().DECAY_RATE
        assert self.gamma == .99
        self.epsilon = 1e-8
Code Example #14
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=64,
                 recurrent=False,
                 device='cpu'):
        super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size)

        self.device = device

        if recurrent:
            num_inputs = hidden_size

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(nn.Linear(num_inputs + action_space.shape[0], hidden_size)),
            nn.Tanh(), init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, 1)))

        self.optimizer = torch.optim.Adam(self.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.train()
Code Example #15
    def __init__(self, venv, pretrained_reward_net_path, chain_path,
                 embedding_dim, env_name):
        VecEnvWrapper.__init__(self, venv)
        self.reward_net = EmbeddingNet(embedding_dim)
        #load the pretrained weights
        self.reward_net.load_state_dict(torch.load(pretrained_reward_net_path))
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        #load the mean of the MCMC chain
        burn = 5000
        skip = 20
        reader = open(chain_path)
        data = []
        for line in reader:
            parsed = line.strip().split(',')
            np_line = []
            for s in parsed[:-1]:
                np_line.append(float(s))
            data.append(np_line)
        data = np.array(data)
        #print(data[burn::skip,:].shape)

        #get average across chain and use it as the last layer in the network
        mean_weight = np.mean(data[burn::skip, :], axis=0)
        #print("mean weights", mean_weight[:-1])
        #print("mean bias", mean_weight[-1])
        #print(mean_weight.shape)
        self.reward_net.fc2 = nn.Linear(
            embedding_dim, 1, bias=False
        )  #last layer just outputs the scalar reward = w^T \phi(s)

        new_linear = torch.from_numpy(mean_weight)
        print("new linear", new_linear)
        print(new_linear.size())
        with torch.no_grad():
            #unsqueeze since nn.Linear wants a 2-d tensor for weights
            new_linear = new_linear.unsqueeze(0)
            #print("new linear", new_linear)
            #print("new bias", new_bias)
            with torch.no_grad():
                #print(last_layer.weight)
                #print(last_layer.bias)
                #print(last_layer.weight.data)
                #print(last_layer.bias.data)
                self.reward_net.fc2.weight.data = new_linear.float().to(
                    self.device)

            #TODO: print out last layer to make sure it stuck...
            print("USING MEAN WEIGHTS FROM MCMC")
            #with torch.no_grad():
            #    for param in self.reward_net.fc2.parameters():
            #        print(param)

        self.reward_net.to(self.device)

        self.rew_rms = RunningMeanStd(shape=())
        self.epsilon = 1e-8
        self.cliprew = 10.
        self.env_name = env_name
Code Example #16
    def __init__(self, device=None, envs=None, ensemble_policy=None, env_name=None,
        expert_dataset=None, ensemble_size=None, ensemble_quantile_threshold=None,
        dril_bc_model=None, dril_cost_clip=None, num_dril_bc_train_epoch=None,\
        training_data_split=None):

        self.ensemble_quantile_threshold = ensemble_quantile_threshold
        self.dril_cost_clip = dril_cost_clip
        self.device = device
        self.num_dril_bc_train_epoch = num_dril_bc_train_epoch
        self.env_name = env_name
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
        self.observation_space = envs.observation_space

        if envs.action_space.__class__.__name__ == "Discrete":
            self.num_actions = envs.action_space.n
        elif envs.action_space.__class__.__name__ == "Box":
            self.num_actions = envs.action_space.shape[0]
        elif envs.action_space.__class__.__name__ == "MultiBinary":
            self.num_actions = envs.action_space.shape[0]

        self.ensemble_size = ensemble_size
        # use full data since we don't use a validation set
        self.trdata = expert_dataset.load_demo_data(
            1.0, 1, self.ensemble_size)['trdata']

        self.ensemble = ensemble_policy
        self.bc = dril_bc_model
        self.bc.num_batches = num_dril_bc_train_epoch
        self.clip_variance = self.policy_variance(envs=envs)
Code Example #17
    def __init__(self, env, model, nsteps, icm, gamma, curiosity):
        super().__init__(env=env, model=model, nsteps=nsteps, icm=icm)
        assert isinstance(
            env.action_space, spaces.Discrete
        ), 'This ACER implementation works only with discrete action spaces!'
        assert isinstance(env, VecFrameStack)

        self.nact = env.action_space.n
        nenv = self.nenv
        self.nbatch = nenv * nsteps
        self.batch_ob_shape = (nenv *
                               (nsteps + 1), ) + env.observation_space.shape

        self.curiosity = curiosity

        self.obs = env.reset()
        self.obs_dtype = env.observation_space.dtype
        self.ac_dtype = env.action_space.dtype
        self.nstack = self.env.nstack
        self.nc = self.batch_ob_shape[-1] // self.nstack
        self.rff = RewardForwardFilter(gamma)
        self.rff_rms = RunningMeanStd()

        # print(" What is NC " , self.nc)
        print(" State of curiosity : ", icm)
Code Example #18
    def __init__(self,
                 input_dim,
                 hidden_dim,
                 device,
                 red=None,
                 sail=False,
                 learn=True):
        super(Discriminator, self).__init__()

        self.device = device

        self.red = red
        self.sail = sail
        self.redtrained = False
        if self.sail:
            assert self.red is not None, 'Cannot run SAIL without using RED'

        self.trunk = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim,
                                             hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.learn = learn
        self.optimizer = torch.optim.Adam(self.trunk.parameters())

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
Code Example #19
File: cor_gail.py  Project: azarafrooz/FTNPL
    def __init__(self,
                 input_dim,
                 action_dim,
                 hidden_size=100,
                 embed_size=0,
                 base=None,
                 base_kwargs=None,
                 device='cpu'):
        super(CorDiscriminator, self).__init__()
        if base_kwargs is None:
            base_kwargs = {}
        if base is None:
            if len(input_dim) == 3:
                base = CNNBase
            elif len(input_dim) == 1:
                base = MLPBase
            else:
                raise NotImplementedError

        self.base = base(input_dim[0],
                         input_dim[1:],
                         action_dim,
                         hidden_size,
                         embed_size,
                         device=device,
                         **base_kwargs)
        self.parameters = self.base.parameters()
        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())
Code Example #20
File: cor_gail.py  Project: azarafrooz/FTNPL
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=64,
                 embed_size=0,
                 recurrent=False,
                 device='cpu'):
        super(MLPBase, self).__init__(recurrent, num_inputs, hidden_size,
                                      embed_size)

        self.device = device

        if recurrent:
            num_inputs = hidden_size

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(
                nn.Linear(num_inputs + action_space.shape[0] + embed_size,
                          hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, hidden_size)), nn.Tanh(),
            init__(nn.Linear(hidden_size, 1)))

        # self.optimizer = torch.optim.Adam(self.parameters(), lr= 3e-5)
        self.optimizer = torch.optim.RMSprop(self.parameters(), lr=5e-5)

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.train()
Code Example #21
    def __init__(self,
                 input_dim,
                 hidden_dim,
                 device,
                 gail_reward_type=None,
                 clip_gail_action=None,
                 envs=None,
                 disc_lr=None):
        super(Discriminator, self).__init__()

        self.device = device

        self.trunk = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim,
                                             hidden_dim), nn.Tanh(),
                                   nn.Linear(hidden_dim, 1)).to(device)

        self.trunk.train()

        self.optimizer = torch.optim.Adam(self.trunk.parameters(), lr=disc_lr)

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        self.reward_type = gail_reward_type
        self.clip_gail_action = clip_gail_action
        self.action_space = envs.action_space
Code Example #22
    def __init__(self, venv, model_dir, ctrl_coeff=0., alive_bonus=0.):
        super().__init__(venv, model_dir, ctrl_coeff, alive_bonus)

        self.rew_rms = [
            RunningMeanStd(shape=()) for _ in range(len(self.models))
        ]
        self.cliprew = 100.
        self.epsilon = 1e-8
Code Example #23
File: utils.py  Project: jsztompka/PPO-demo
 def __call__(self, x):
     x = np.asarray(x)
     if self.rms is None:
         self.rms = RunningMeanStd(shape=(1, ) + x.shape[1:])
     if not self.read_only:
         self.rms.update(x)
     return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                    -self.clip, self.clip)
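A short usage sketch for the filter above, assuming the surrounding class exposes the rms, read_only, clip, and epsilon attributes that __call__ relies on; the class name NormalizeFilter and its constructor are hypothetical, and only __call__ mirrors the excerpt.

import numpy as np
from baselines.common.running_mean_std import RunningMeanStd

class NormalizeFilter:
    # Hypothetical minimal host class for the __call__ shown above.
    def __init__(self, clip=10.0, epsilon=1e-8, read_only=False):
        self.rms = None  # created lazily from the first batch's shape
        self.clip = clip
        self.epsilon = epsilon
        self.read_only = read_only

    def __call__(self, x):
        x = np.asarray(x)
        if self.rms is None:
            self.rms = RunningMeanStd(shape=(1,) + x.shape[1:])
        if not self.read_only:
            self.rms.update(x)
        return np.clip((x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
                       -self.clip, self.clip)

obs_filter = NormalizeFilter()
batch = np.random.randn(32, 8)      # 32 observations of dimension 8
normalized = obs_filter(batch)      # approximately standardized and clipped to +/-10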
Code Example #24
File: cppo_agent.py  Project: ijcai-261/ijcai-261
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))
        
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        self.caculate_number_parameters(params)

        flow_params = [v for v in params if 'flow' in v.name]
        other_params = [v for v in params if 'flow' not in v.name]

        print('length of flow params: ', len(flow_params))
        print('length of agent params: ', len(other_params))
        
        trainer_flow = tf.train.AdamOptimizer(learning_rate=self.flow_lr)
        trainer_agent = tf.train.AdamOptimizer(learning_rate=self.ph_lr)

        grads = tf.gradients(self.total_loss, flow_params + other_params)
        grads_flow = grads[:len(flow_params)]
        grads_agent = grads[len(flow_params):]

        train_flow = trainer_flow.apply_gradients(zip(grads_flow, flow_params))
        train_agent = trainer_agent.apply_gradients(zip(grads_agent, other_params))

        self._train = tf.group(train_flow, train_agent)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
            l in range(self.nlump)]

        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
Code Example #25
    def __init__(self, envs, size_obs_to_norm=13, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.95, epsilon=1e-8, use_tf=False):
        self.envs = envs
        self.size_obs_to_norm = size_obs_to_norm

        if use_tf:
            from baselines.common.running_mean_std import TfRunningMeanStd
            self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
            self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None

        else:
            from baselines.common.running_mean_std import RunningMeanStd
            self.ob_rms = RunningMeanStd(shape=(size_obs_to_norm,)) if ob else None
            self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.ret = np.zeros(self.num_envs)
        self.gamma = gamma
        self.epsilon = epsilon
Code Example #26
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        nbatch = nenv*nsteps
        ob_shape = (nbatch, ob_space.shape[0]*nstack)
        nact = ac_space.shape[0]
        X = tf.placeholder(tf.float32, ob_shape) #obs
        self.pdtype = pdtype = make_pdtype(ac_space)
        with tf.variable_scope("obfilter", reuse=reuse):
            self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
        with tf.variable_scope("retfilter", reuse=reuse):
            self.ret_rms = RunningMeanStd(shape=(1,))

        obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
        #obz = X

        with tf.variable_scope("model", reuse=reuse):
            h1 = tf.nn.tanh(dense(obz, 128, "fc1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h2 = tf.nn.tanh(dense(h1, 128, "fc2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h3 = tf.nn.tanh(dense(h2, 128, "fc3", weight_init=U.normc_initializer(1.0), bias_init=0.0))

            mean = dense(h3, nact, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0)
            logstd = tf.get_variable("logstd", [nact], tf.float32, tf.zeros_initializer())
            logstd = tf.expand_dims(logstd, 0)
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            vf = dense(h3, 1, "v", weight_init=U.normc_initializer(1.0), bias_init=0.0)

        v0 = vf[:, 0]
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        self.initial_state = [] #not stateful

        def step(stoch, ob, *_args, **_kwargs):
            a, v = sess.run([a0, v0], {stochastic:stoch, X:ob})
            return a, v, [] #dummy state

        def value(ob, *_args, **_kwargs):
            return sess.run(v0, {X:ob})

        self.X = X
        self.vf = vf
        self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
        self.step = step
        self.value = value
Code Example #27
File: cor_gail.py  Project: azarafrooz/FTNPL
    def __init__(self,
                 num_inputs,
                 input_size,
                 action_space,
                 hidden_size=512,
                 embed_size=0,
                 recurrent=False,
                 device='cpu'):

        super(CNNBase, self).__init__(recurrent, num_inputs, hidden_size,
                                      embed_size)

        self.device = device
        self.action_space = action_space

        h, w = input_size
        self.conv1 = nn.Conv2d(num_inputs, 32, kernel_size=8, stride=4)
        w_out = conv2d_size_out(w, kernel_size=8, stride=4)
        h_out = conv2d_size_out(h, kernel_size=8, stride=4)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        w_out = conv2d_size_out(w_out, kernel_size=4, stride=2)
        h_out = conv2d_size_out(h_out, kernel_size=4, stride=2)

        self.conv3 = nn.Conv2d(64, 32, kernel_size=3, stride=1)
        w_out = conv2d_size_out(w_out, kernel_size=3, stride=1)
        h_out = conv2d_size_out(h_out, kernel_size=3, stride=1)

        init_cnn_ = lambda m: init(m, nn.init.orthogonal_,
                                   lambda x: nn.init.constant_(x, 0),
                                   nn.init.calculate_gain('relu'))

        self.cnn_trunk = nn.Sequential(
            init_cnn_(self.conv1), nn.ReLU(), init_cnn_(self.conv2), nn.ReLU(),
            init_cnn_(self.conv3), nn.ReLU(), Flatten(),
            init_cnn_(nn.Linear(32 * h_out * w_out, hidden_size)), nn.ReLU())

        init__ = lambda m: init(m, nn.init.orthogonal_,
                                lambda x: nn.init.constant_(x, 0), np.sqrt(2))

        self.trunk = nn.Sequential(
            init__(
                nn.Linear(hidden_size + self.action_space.n + embed_size,
                          hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, hidden_size // 2)), nn.Tanh(),
            init__(nn.Linear(hidden_size // 2, 1)))

        # self.optimizer = torch.optim.Adam(self.parameters(), lr=3e-5)
        self.optimizer = torch.optim.RMSprop(
            self.parameters(), lr=5e-5
        )  # To be consistent with the WGAN optimizer, although not necessary

        self.returns = None
        self.ret_rms = RunningMeanStd(shape=())

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Code Example #28
File: normalizer.py  Project: jieli18/SA_DDPG
 def __call__(self, x):
     from baselines.common.running_mean_std import RunningMeanStd
     x = np.asarray(x)
     if self.rms is None:
         self.rms = RunningMeanStd(shape=(1, ) + x.shape[1:])
     if not self.read_only:
         self.rms.update(x)
     return np.clip(
         (x - self.rms.mean) / np.sqrt(self.rms.var + self.epsilon),
         -self.clip, self.clip)
Code Example #29
File: vec_normalize.py  Project: rgalljamov/deephop
 def __init__(
     self,
     venv,
     ob=False,
     ret=False,
     clipob=10.,
     cliprew=10.,
     gamma=0.99,
     epsilon=1e-8
 ):  # Akhil: add running mean and variance here so the correct mean and var can be inputted here when a model is loaded!
     VecEnvWrapper.__init__(self, venv)
     self.ob_rms = RunningMeanStd(
         shape=self.observation_space.shape) if ob else None
     self.ret_rms = RunningMeanStd(shape=()) if ret else None
     self.clipob = clipob
     self.cliprew = cliprew
     self.ret = np.zeros(self.num_envs)
     self.gamma = gamma
     self.epsilon = epsilon
Code Example #30
    def start_interaction(self, env_fns, dynamics, nlump=2):
        # When starting to interact with the environments, define the variables and
        # computation graph and initialize the Rollout class.
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        # Define the loss, gradients, and backprop; during training, call sess.run(self._train) to run one update.
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
        print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in params]))      # 6629459
        print("dvae params:", np.sum([np.prod(v.get_shape().as_list()) for v in params_dvae]))  # 2726144
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        # add bai.  Compute the DVAE gradients separately.
        gradsandvars_dvae = trainer.compute_gradients(self.dynamics_loss, params_dvae)
        self._train_dvae = trainer.apply_gradients(gradsandvars_dvae)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)        # default: 128
        self.nlump = nlump                       # default: 1
        self.lump_stride = nenvs // self.nlump   # 128/1=128
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
            l in range(self.nlump)]

        # This class is defined in rollouts.py.
        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        # Shape: (number of envs (threads), rollout horizon T)
        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()