Code example #1
    def grad_logp(self, obs, action):
        x = to_tensor(obs)
        actions = to_tensor(action)

        dist = self(x)
        logp = dist.log_prob(actions)
        grads = []
        lp = logp.sum(dim=-1)
        if lp.shape[0] == 1:
            self.zero_grad()
            lp[0].backward()
            grads.append(get_flat_grad(self))
        else:
            for i in range(lp.shape[0]):
                self.zero_grad()
                if i == lp.shape[0] - 1:
                    lp[i].backward(retain_graph=False)
                else:
                    lp[i].backward(retain_graph=True)
                grads.append(get_flat_grad(self))

        grad = np.array(grads, dtype=np.float64)
        logp = to_numpy(lp).reshape(-1, 1)

        return logp, grad
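
These snippets lean on a few helpers (to_tensor, to_numpy, get_flat_grad) that are not shown on this page. A minimal sketch of what they might look like, assuming plain NumPy/PyTorch conversions and parameter-order gradient flattening; the repository's actual implementations may differ:

import numpy as np
import torch


def to_tensor(x, requires_grad=False):
    # convert a NumPy array (or scalar) into a float32 torch tensor
    t = torch.as_tensor(np.asarray(x), dtype=torch.float32)
    t.requires_grad_(requires_grad)
    return t


def to_numpy(t):
    # detach a tensor from the graph and move it back to NumPy
    return t.detach().cpu().numpy()


def get_flat_grad(module):
    # concatenate each parameter's .grad into one flat vector;
    # parameters with no gradient contribute zeros
    flat = []
    for p in module.parameters():
        if p.grad is None:
            flat.append(np.zeros(p.numel(), dtype=np.float64))
        else:
            flat.append(to_numpy(p.grad).reshape(-1).astype(np.float64))
    return np.concatenate(flat)
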
Code example #2
File: ppo.py  Project: ScottJordan/python-rllib
    def update(self, state, act, blogp, reward, next_state, terminal):
        '''
        update algorithm for both value function and policy
        :param state: tensor of current states (BatchsizeXstate_dims)
        :param act: tensor for action taken (BatchsizeXActdim)
        :param blogp: tensor of log probabilities (BatchsizeX1)
        :param reward: reward at time t (BatchsizeX1)
        :param next_state: tensor of next states (BatchsizeXstate_dims)
        :param terminal: bool for end of episode
        :return:
        '''

        self.states.append(to_tensor(state.astype(np.float32)))
        self.rewards.append(to_tensor(np.array([reward], dtype=np.float32)))
        self.actions.append(to_tensor(np.array([act], dtype=np.float32)))
        self.blogps.append(to_tensor(np.array([blogp], dtype=np.float32)))
        self.terminals.append(
            to_tensor(np.array([float(terminal)], dtype=np.float32)))
        v = self.vf.predict(state).detach()
        self.vals.append(v)

        if len(self.states) >= self.step_batch:
            self.vals.append(torch.zeros_like(v))
            err = self.run_update()
            self.states = []
            self.rewards = []
            self.actions = []
            self.blogps = []
            self.terminals = []
            self.vals = []
        else:
            err = [0., 0.]

        return err
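
A hypothetical driver loop for the update() method above. The Gym-style environment interface (reset()/step() returning a 4-tuple) and the run_steps helper are assumptions, not part of the repository code shown here:

import numpy as np


def run_steps(env, agent, max_steps=10000):
    # update() buffers each transition and only runs a PPO update once
    # step_batch transitions have been collected; until then it returns [0., 0.]
    state = env.reset()
    err = [0., 0.]
    for _ in range(max_steps):
        act, blogp = agent.policy.get_action(state.astype(np.float32))
        next_state, reward, terminal, _ = env.step(act)
        err = agent.update(state, act, blogp, reward, next_state, terminal)
        state = env.reset() if terminal else next_state
    return err
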
Code example #3
File: network.py  Project: ScottJordan/python-rllib
    def predict(self, obs):
        if not torch.is_tensor(obs):
            if isinstance(obs, list):
                obs = [to_tensor(i) for i in obs]
            else:
                obs = to_tensor(obs)
        v = self(obs)
        return v
Code example #4
File: network.py  Project: ScottJordan/python-rllib
    def predict(self, obs: Tuple):
        obs, acts = obs
        if not torch.is_tensor(obs):
            if isinstance(obs, list):
                obs = [to_tensor(i) for i in obs]
            else:
                obs = to_tensor(obs)
        q = self(obs)
        # TODO I do not think this work for batch actions
        if acts is not None:
            return q[:, acts]
        else:
            return q
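
The TODO above flags a real limitation: q[:, acts] selects the same action columns for every row of the batch. A sketch of per-row indexing with torch.gather; the helper name is hypothetical and acts is assumed to be an integer tensor of shape (batch,):

import torch


def select_per_row(q, acts):
    # q: (batch, num_actions) action values; acts: (batch,) integer action indices
    # pick each row's own action value rather than the same columns for every row
    return q.gather(1, acts.long().view(-1, 1)).squeeze(1)
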
Code example #5
    def get_action(self, obs, stochastic=True):
        if self.numpy_action:
            if not isinstance(obs, torch.Tensor):
                with torch.no_grad():
                    obs = to_tensor(obs)

            return self.get_action_numpy(obs, stochastic)
        else:
            if not isinstance(obs, torch.Tensor):
                if isinstance(obs, list):
                    obs = [to_tensor(ob) for ob in obs]
                else:
                    obs = to_tensor(obs)
            return self.get_action_torch(obs, stochastic)
Code example #6
    def get_action_numpy(self, obs, stochastic=True):
        x = obs
        with torch.no_grad():
            x = to_tensor(x).float()
            dist = self(x)

        if stochastic:
            #a = np.array([np.random.normal(loc=mu[i, :], scale=std[i, :]) for i in range(mu.shape[0])]).reshape(-1, mu.shape[1])
            action = dist.sample()
            logp = to_numpy(dist.log_prob(action).sum(dim=-1))
            a = to_numpy(action)
        else:
            if isinstance(dist, D.Categorical):
                a = np.argmax(to_numpy(dist.probs))
            elif isinstance(dist, D.Normal):
                a = to_numpy(dist.loc)
            else:
                raise Exception("dist type not recognized: " + str(type(dist)))
            logp = np.zeros_like(a).sum(axis=-1)

        if len(logp.shape) == 0:
            logp = np.array([logp])
        if len(a.shape) == 0:
            a = np.array([a])

        return a, logp
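
A usage sketch for get_action_numpy, assuming policy is an instance of the class above and the observation is a NumPy float array (the 4-dimensional shape is only illustrative):

import numpy as np

obs = np.zeros(4, dtype=np.float32)
a, logp = policy.get_action_numpy(obs, stochastic=True)    # sampled action and its log-probability
a_det, _ = policy.get_action_numpy(obs, stochastic=False)  # dist mean (Normal) or argmax (Categorical); logp is zeros
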
Code example #7
    def __init__(self, obs_space, dims, act_fn=nn.LeakyReLU(), ranges=None):
        super(MLPBase, self).__init__()
        self.obs_space = obs_space
        self.dims = dims
        self.act_fn = act_fn

        in_dim = obs_space.low.shape[0]
        in_dims = [in_dim] + dims[:-1]
        out_dims = dims
        self.ranges = ranges
        if ranges is not None:  # scale each input dim by 1 / max(|x|)
            # self.feat_range = (ranges[:, 1] - ranges[:, 0]).astype(np.float64) # scales to be [0,1]
            self.feat_range = np.abs(ranges).max(axis=1)
            self.feat_range[self.feat_range == 0] = 1
            self.feat_range = to_tensor(self.feat_range, requires_grad=False)
            self.ranges = to_tensor(self.ranges, requires_grad=False)
        self.layers = nn.ModuleList(
            nn.Linear(idim, odim) for idim, odim in zip(in_dims, out_dims))
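
Only the constructor is shown here; the forward pass is not on this page. A hypothetical sketch of how the input scaling and the layer list might be consumed (the real MLPBase.forward may differ, for example in whether the final layer is activated):

def mlp_forward(model, x):
    # scale each input dimension by 1 / max(|x|) when ranges were provided
    if model.ranges is not None:
        x = x / model.feat_range
    # hidden layers use the configured activation; the final layer is left linear
    for layer in model.layers[:-1]:
        x = model.act_fn(layer(x))
    return model.layers[-1](x)
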
Code example #8
File: network.py  Project: ScottJordan/python-rllib
    def predict(self, obs: Tuple):
        obs, acts = obs
        if not torch.is_tensor(obs):
            obs = to_tensor(obs)
        q = self(obs)
        # TODO I do not think this work for batch actions
        if acts is not None:
            return q[:, acts]
        else:
            return q
Code example #9
File: ppo.py  Project: ScottJordan/python-rllib
    def act(self, env, stochastic=True, train=True):

        state = env.state.astype(np.float32)
        act, blogp = self.policy.get_action(state, stochastic)
        env.step(act)
        reward = env.reward
        terminal = env.done

        # store the state the action was taken in
        self.states.append(to_tensor(state.astype(np.float32)))
        # store the reward for taking the action in that state
        self.rewards.append(to_tensor(np.array([reward], dtype=np.float32)))
        # store the action taken
        self.actions.append(to_tensor(np.array([act], dtype=np.float32)))
        # store the log probability of the action taken
        self.blogps.append(to_tensor(np.array([blogp], dtype=np.float32)))
        # store whether or not the action resulted in a terminal state
        self.terminals.append(
            to_tensor(np.array([float(terminal)], dtype=np.float32)))
        # store the value function estimate for the state the action was taken in
        v = self.vf.predict(state).detach()
        self.vals.append(v)

        # if the buffer holds at least step_batch transitions, update the policy and value function
        if len(self.states) >= self.step_batch:
            self.vals.append(torch.zeros_like(v))
            err = self.run_update()  # run update function

            # clear buffer
            self.states = []
            self.rewards = []
            self.actions = []
            self.blogps = []
            self.terminals = []
            self.vals = []
        else:
            err = [0., 0.]

        return err
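
A hypothetical episode loop driving act() above. The snippet itself only reads env.state, env.reward, and env.done and calls env.step(); the reset() method and the run_episodes helper are assumptions:

def run_episodes(agent, env, num_episodes=100):
    # act() samples an action, steps the environment, buffers the transition,
    # and triggers a PPO update once step_batch transitions have been collected
    err = [0., 0.]
    for _ in range(num_episodes):
        env.reset()
        while not env.done:
            err = agent.act(env, stochastic=True, train=True)
    return err
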
Code example #10
File: network.py  Project: ScottJordan/python-rllib
    def predict(self, obs):
        if not torch.is_tensor(obs):
            obs = to_tensor(obs)
        v = self(obs)
        return v