Example #1
    def f(w):
        reward_total = 0
        reps = 1
        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

        for i in range(reps):
            reward = 0
            done = False
            obs = env.reset()

            h_0 = policy.init_hidden()
            while not done:

                # Get action from policy
                with torch.no_grad():
                    act, h_1 = policy((my_utils.to_tensor(obs, True), h_0))

                # Step environment
                act = act.squeeze(0).numpy()
                #act = np.array([-1,0])
                obs, rew, done, _ = env.step(act)

                if animate:
                    env.render()

                reward += rew

                h_0 = h_1

            reward_total += reward

        return -reward_total / reps
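
Every example in this collection relies on the same round trip: parameters_to_vector flattens a module's weights into one 1-D tensor, and vector_to_parameters writes such a vector back into the module in place. A minimal, self-contained sketch of that invariant (the network here is only a placeholder):

    import torch
    import torch.nn as nn
    from torch.nn.utils import parameters_to_vector, vector_to_parameters

    policy = nn.Sequential(nn.Linear(4, 32), nn.Tanh(), nn.Linear(32, 2))
    w = parameters_to_vector(policy.parameters())          # 1-D tensor holding every weight and bias
    vector_to_parameters(w + 0.01 * torch.randn_like(w),   # perturb in flat space ...
                         policy.parameters())              # ... and write the result back into the modules
    assert parameters_to_vector(policy.parameters()).shape == w.shape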
Example #2
    def gradient_ascent_step(self):
        """Makes one update of policy weights"""
        
        # get loss
        loss = self.surrogate_function(write_to_log=True)
        
        # calculating gradient
        self.policy.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        policy_gradient = parameters_to_vector([v.grad for v in self.policy.parameters()]).squeeze(0)        
        assert policy_gradient.nonzero().size()[0] > 0, "Policy gradient is zero; skipping update."
        
        # Use conjugate gradient algorithm to determine the step direction in theta space
        step_direction = self.conjugate_gradient(-policy_gradient.cpu().numpy())

        # Do line search to determine the stepsize of theta in the direction of step_direction
        shs = step_direction.dot(self.hessian_vector_product(Tensor(step_direction)).cpu().numpy().T) / 2
        lm = np.sqrt(shs / self.config.max_kl)
        fullstep = step_direction / lm
        gdotstepdir = -policy_gradient.dot(Tensor(step_direction)).data[0]
        theta = self.linesearch(parameters_to_vector(self.policy.parameters()), fullstep, gdotstepdir / lm)

        # Update parameters of policy model
        if any(np.isnan(theta.data.cpu().numpy())):
            raise Exception("NaN detected. Skipping update...")
        else:
            vector_to_parameters(theta, self.policy.parameters())

        kl_old_new = self.mean_kl_divergence()
        self.logger["kl_change"].append(kl_old_new.item())
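
The conjugate_gradient helper called above is not shown in this example. A minimal NumPy sketch of the routine it is commonly assumed to implement, written against a Hessian-vector-product callable hvp (the signature and defaults here are illustrative, not taken from the source):

    import numpy as np

    def conjugate_gradient(hvp, b, iters=10, tol=1e-10):
        """Approximately solve H x = b using only Hessian-vector products hvp(v) = H v."""
        x = np.zeros_like(b)
        r = b.copy()                      # residual b - H x, with x = 0
        p = r.copy()                      # current search direction
        rdotr = r.dot(r)
        for _ in range(iters):
            Hp = hvp(p)
            alpha = rdotr / p.dot(Hp)
            x += alpha * p
            r -= alpha * Hp
            new_rdotr = r.dot(r)
            if new_rdotr < tol:
                break
            p = r + (new_rdotr / rdotr) * p
            rdotr = new_rdotr
        return x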
Example #3
    def line_search(self, gradients, states, actions, log_probs, rewards):
        step_size = (2 * self.kl_delta / gradients.dot(
            self.fisher_vector_direct(gradients, states))).sqrt()
        step_size_decay = 1.5
        line_search_attempts = 10

        # New policy
        current_parameters = parameters_to_vector(self.policy.parameters())
        new_policy = deepcopy(self.policy)
        vector_to_parameters(current_parameters + step_size * gradients,
                             new_policy.parameters())
        new_std = self.logstd.detach() + step_size * self.logstd.grad

        # Shrink the step until the KL constraint is met and the objective improves
        for attempt in range(line_search_attempts):
            # Obtain kl divergence and objective
            with torch.no_grad():
                kl_value = self.kl(new_policy, new_std, states)
                objective = self.surrogate_objective(new_policy, new_std,
                                                     states, actions,
                                                     log_probs, rewards)

            # Shrink the step if the KL constraint is violated or the objective got worse
            if kl_value > self.kl_delta or objective < 0:
                step_size /= step_size_decay
                vector_to_parameters(
                    current_parameters + step_size * gradients,
                    new_policy.parameters())
                new_std = self.logstd.detach() + step_size * self.logstd.grad
            # Return the new policy and std once both conditions are met
            else:
                return new_policy, new_std.requires_grad_()

        # Return old policy and std if constraints never met
        return self.policy, self.logstd
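
The fisher_vector_direct call above, like the hessian_vector_product calls in the TRPO examples, is typically implemented by double backpropagation through the KL divergence. A minimal sketch under that assumption (kl must be a scalar computed with a live graph over params; names are illustrative):

    import torch
    from torch.nn.utils import parameters_to_vector

    def hessian_vector_product(kl, params, v, damping=1e-2):
        """Return (H + damping * I) v, where H is the Hessian of the scalar kl w.r.t. params."""
        params = list(params)
        grads = torch.autograd.grad(kl, params, create_graph=True)
        flat_grad = parameters_to_vector(grads)
        grad_v = torch.dot(flat_grad, v)               # scalar g^T v
        hv = torch.autograd.grad(grad_v, params, retain_graph=True)
        return parameters_to_vector(hv) + damping * v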
Example #4
def train(env, policy, config):
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, config["cma_std"])
    f = f_wrapper(env, policy)

    sdir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                        f'agents/{config["session_ID"]}_ES_policy.p')

    print(f'N_params: {len(w)}')

    it = 0
    try:
        while not es.stop():
            it += 1
            if it > config["iters"]:
                break
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()

    except KeyboardInterrupt:
        print("User interrupted process.")

    vector_to_parameters(
        torch.from_numpy(es.result.xbest).float(), policy.parameters())
    T.save(policy.state_dict(), sdir)
    print("Saved agent, {}".format(sdir))

    return es.result.fbest
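
The ask/tell loop used here (and in Examples #7, #8 and #15) is the standard pycma interface. A toy run on a plain quadratic, independent of any environment or policy:

    import cma
    import numpy as np

    es = cma.CMAEvolutionStrategy(np.zeros(8), 0.5)      # initial mean and step size
    while not es.stop():
        X = es.ask()                                     # sample a population of candidate vectors
        es.tell(X, [float(np.sum(x ** 2)) for x in X])   # lower fitness is better
        es.disp()
    print(es.result.xbest, es.result.fbest)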
Example #5
    def surrogate_loss(self, theta):
        """
        Returns the surrogate loss w.r.t. the given parameter vector theta
        """
        theta = theta.detach()
        new_model = copy.deepcopy(self.model)
        # for param in new_model.parameters():
        #     print(param)
        vector_to_parameters(theta, new_model.policy_parameters())

        if self.continuous:
            mean_new, std_new, _ = new_model(self.obs)
            mean_old, std_old, _ = self.model(self.obs)

            dis_new = Normal(mean_new, std_new)
            dis_old = Normal(mean_old, std_old)

            log_prob_new = dis_new.log_prob(self.acts).sum(-1, keepdim=True)
            log_prob_old = dis_old.log_prob(self.acts).sum(-1, keepdim=True)

            ratio = torch.exp(log_prob_new - log_prob_old).detach()
        else:

            probs_new, _ = new_model(self.obs)
            probs_old, _ = self.model(self.obs)

            dis_new = F.softmax(probs_new, dim=1)
            dis_old = F.softmax(probs_old, dim=1)

            probs_new = dis_new.gather(1, self.acts).detach()
            probs_old = dis_old.gather(1, self.acts).detach() + 1e-8

            ratio = probs_new / probs_old

        return -torch.mean(ratio * self.advs)
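
A small stand-alone illustration of the continuous branch above: per-sample importance weights from two diagonal Gaussian policies (the means, stds and actions are dummy values):

    import torch
    from torch.distributions import Normal

    acts = torch.randn(5, 2)
    mean_old, std_old = torch.zeros(5, 2), torch.ones(5, 2)
    mean_new, std_new = 0.1 * torch.ones(5, 2), torch.ones(5, 2)

    logp_old = Normal(mean_old, std_old).log_prob(acts).sum(-1, keepdim=True)
    logp_new = Normal(mean_new, std_new).log_prob(acts).sum(-1, keepdim=True)
    ratio = torch.exp(logp_new - logp_old)   # shape (5, 1), multiplies the advantages in the loss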
Example #6
    def learn_htrpo(self):
        b_t = time.time()
        self.sample_batch()
        self.split_episode()
        # No valid episode is collected
        if self.n_valid_ep == 0:
            return
        self.generate_subgoals()
        if not self.using_original_data:
            self.reset_training_data()
        if self.sampled_goal_num is None or self.sampled_goal_num > 0:
            self.generate_fake_data()
        self.data_preprocess()
        self.other_data = self.goal

        # Optimize Value Estimator
        self.estimate_value()
        if self.value_type is not None:
            # update value
            for i in range(self.iters_v):
                self.update_value()

        # Optimize Policy
        # imp_fac: a 1-D tensor of likelihood ratios, with the same length as a.size(0)
        # self.estimate_value()
        imp_fac = self.compute_imp_fac()

        if self.value_type:
            # old value estimator
            self.A = self.gamma_discount * self.hratio * self.A
        else:
            self.A = self.gamma_discount * self.A

        # Here mean() and sum() / self.n_traj are equivalent up to a constant
        # coefficient, which is compensated for by the step-size computation in
        # TRPO. In vanilla PG there is no such compensation, so the loss must
        # follow the exact form of the equation in the paper.
        self.loss = - (imp_fac * self.A).mean() - self.entropy_weight * self.compute_entropy()

        self.policy.zero_grad()
        loss_grad = torch.autograd.grad(
            self.loss, self.policy.parameters(), create_graph=True)
        # loss_grad_vector is a 1-D tensor containing the gradients of all parameters in self.policy
        loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
        # solve A x = -g, where A is the Hessian of the KL divergence
        trpo_grad_direc = self.conjunction_gradient(- loss_grad_vector)
        shs = .5 * torch.sum(trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
        beta = torch.sqrt(self.max_kl / shs)
        fullstep = trpo_grad_direc * beta
        gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
        theta = self.linear_search(parameters_to_vector(
            self.policy.parameters()), fullstep, gdotstepdir * beta)
        vector_to_parameters(theta, self.policy.parameters())
        self.learn_step_counter += 1
        self.cur_kl = self.mean_kl_divergence().item()
        self.policy_ent = self.compute_entropy().item()
        self.update_normalizer()
        print("iteration time:   {:.4f}".format(time.time()-b_t))
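
The beta above comes from the quadratic KL approximation: with step direction s, choosing beta = sqrt(max_kl / shs), where shs = 0.5 * s^T H s, makes the estimated KL of the full step, 0.5 * (beta*s)^T H (beta*s), equal exactly max_kl. A numeric sketch with a stand-in Hessian-vector product:

    import torch

    s = torch.randn(10)                  # step direction from conjugate gradient
    Hs = 2.0 * s                         # stand-in for hessian_vector_product(s)
    shs = 0.5 * torch.dot(s, Hs)
    max_kl = 1e-2
    beta = torch.sqrt(max_kl / shs)
    fullstep = beta * s
    assert torch.isclose(0.5 * beta ** 2 * torch.dot(s, Hs), torch.tensor(max_kl))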
Example #7
def train(params):
    env_fun, iters, animate, camera, _ = params

    env = env_fun(animate=animate, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim
    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)
    f = f_wrapper(env, policy, animate)

    print(
        "Env: {} Action space: {}, observation space: {}, N_params: {}, comments: ..."
        .format(env_fun.__name__, act_dim, obs_dim, len(w)))
    it = 0
    try:
        while not es.stop():
            it += 1
            if it > iters:
                break
            if it % 1000 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()
            es.tell(X, [f(x) for x in X])
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
Example #8
def train_mt(params):
    env_fun, iters, animate, camera, model = params
    env = env_fun(animate=False, camera=camera)
    obs_dim, act_dim = env.obs_dim, env.act_dim

    policy = NN(obs_dim, act_dim).float()
    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.5)

    print(
        "Env: {} Action space: {}, observation space: {}, N_params: {}, comments: ..."
        .format("Ant_reach", act_dim, obs_dim, len(w)))

    sims = [mujoco_py.MjSim(model) for _ in range(es.popsize)]
    policies = [policy] * es.popsize

    ctr = 0
    try:
        while not es.stop():
            ctr += 1
            if ctr > iters:
                break
            if ctr % 1000 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}.p".format(env_fun.__name__))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint")
            X = es.ask()

            output = mp.Queue()
            processes = []
            for i, ef, sim, policy, x in zip(range(es.popsize),
                                             [env_fun] * es.popsize, sims,
                                             policies, X):
                processes.append(
                    mp.Process(target=f_mp,
                               args=(i, ef, sim, policy, x, output)))

            # Run processes
            for p in processes:
                p.start()

            # Wait for all processes to finish
            for p in processes:
                p.join()

            evals = [output.get() for _ in processes]
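            # Queue entries arrive in arbitrary order, so each result is tagged with its
            # worker index, sorted by it, and only then passed to es.tell in the same order as X.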
            evals.sort(key=lambda x: x[0])
            evals = [ev[1] for ev in evals]

            es.tell(X, evals)
            es.disp()
    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
Example #9
 def linesearch(self, x, fullstep, expected_improve_rate):
     """
     Returns the parameter vector given by a linesearch
     input: x - Tensor, 1D, current parameters
     input: fullstep - Tensor, 1D, direction (natural gradient), normalized
     input: expected_improve_rate - scalar, expected improvement per unit of step fraction
     output: new parameters - Tensor, 1D
     """
     accept_ratio = 0.1
     max_backtracks = 10
     with torch.no_grad():
         fval = self.surrogate_function().mean()
         
     for (_n_backtracks, stepfrac) in enumerate(0.5**np.arange(max_backtracks)):
         xnew = x.data.cpu().numpy() + stepfrac * fullstep
         vector_to_parameters(Tensor(xnew), self.policy.parameters())
         with torch.no_grad():
             newfval = self.surrogate_function().mean()
         actual_improve = fval - newfval
         expected_improve = expected_improve_rate * stepfrac
         ratio = actual_improve / expected_improve
         #print(actual_improve, " ", expected_improve)
         if ratio > accept_ratio and actual_improve > 0:
             #print("Accepted")
             self.logger["acceptance_ratio"].append(ratio)
             self.logger["expected_improvement"].append(expected_improve_rate)
             return Tensor(xnew)
     
     raise Exception("Line search error")
     return x   
Example #10
 def learn(self):
     self.sample_batch()
     # imp_fac: a 1-D tensor of likelihood ratios, with the same length as a.size(0)
     imp_fac = self.compute_imp_fac()
     self.estimate_value()
     self.A = (self.A - self.A.mean()) / (self.A.std() + 1e-8)
     self.loss = -(imp_fac * self.A
                   ).mean() - self.entropy_weight * self.compute_entropy()
     if self.value_type is not None:
         # update value
         for i in range(self.iters_v):
             self.update_value()
     self.policy.zero_grad()
     loss_grad = torch.autograd.grad(self.loss,
                                     self.policy.parameters(),
                                     create_graph=True)
     # loss_grad_vector is a 1-D tensor containing the gradients of all parameters in self.policy
     loss_grad_vector = parameters_to_vector([grad for grad in loss_grad])
     # solve A x = -g, where A is the Hessian of the KL divergence
     trpo_grad_direc = self.conjunction_gradient(-loss_grad_vector)
     shs = .5 * torch.sum(
         trpo_grad_direc * self.hessian_vector_product(trpo_grad_direc))
     beta = torch.sqrt(self.max_kl / shs)
     fullstep = trpo_grad_direc * beta
     gdotstepdir = -torch.sum(loss_grad_vector * trpo_grad_direc)
     theta = self.linear_search(
         parameters_to_vector(self.policy.parameters()), fullstep,
         gdotstepdir * beta)
     # update policy
     vector_to_parameters(theta, self.policy.parameters())
     self.learn_step_counter += 1
     self.cur_kl = self.mean_kl_divergence().item()
     self.policy_ent = self.compute_entropy().item()
Example #11
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_loss, _, old_pis = self.surrogate_loss(episodes)
        grads = torch.autograd.grad(old_loss, self.policy.parameters())
        grads = parameters_to_vector(grads)

        step = grads / torch.norm(grads)

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())
            loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
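        # for-else: this branch runs only if the loop finished without a break,
        # i.e. no backtracked step satisfied both the improvement and KL conditions.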
        else:
            vector_to_parameters(old_params, self.policy.parameters())
Example #12
    def f(w):
        rewards = []
        done = False
        obs, _ = env.reset()

        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

        while not done:

            # Get action from policy
            with torch.no_grad():
                act = policy(my_utils.to_tensor(obs, True))

            # Step environment
            obs, rew, done, od = env.step(act.squeeze(0).numpy())

            if animate:
                env.render()

            rewards.append(od['rV'])

        r = 0
        for rew in rewards:
            rew_arr = np.array(rew)
            r += rew_arr.sum() - np.abs(rew_arr - rew_arr.mean()).mean()

        return -r
Example #13
 def get_loss(self, theta, b_s, b_a, advantage):
     # get surrogate loss
     prob_old = self._policy(b_s).gather(1, b_a).data
     new_model = copy.deepcopy(self._policy)
     vector_to_parameters(theta, new_model.parameters())
     prob_new = new_model(b_s).gather(1, b_a).data
     return -(prob_new / prob_old * advantage).mean()
Example #14
    def line_search_v2(self, theta):
        '''
        line search to return the parameter vector
        '''
        old_loss = self.surrogate_loss(theta)
        old_loss = Variable(old_loss, requires_grad=True)
        params = torch.cat(
            [param.view(-1) for param in self.pg_model.parameters()])
        old_loss.backward(params)
        old_loss_grad = old_loss.grad
        s = self.conjugate_gradient(old_loss_grad)

        s = torch.from_numpy(s).float()
        beta = torch.sqrt(2 * self.delta / (s * old_loss_grad).sum())

        beta_end = 0
        decay = 100
        alpha = 0.1
        for d in range(decay):
            beta = beta * math.exp(-alpha * d)  # shrink exponentially
            theta_new = theta + beta * s

            # compute objective
            new_loss = self.surrogate_loss(theta_new)

            new_model = deepcopy(self.pg_model)
            vector_to_parameters(theta_new, new_model.parameters())

            mean_kl, _, _ = self.get_mean_kl_divergence(new_model)

            if mean_kl <= self.delta and new_loss < old_loss:  # objective improve
                return theta_new
        return theta
Example #15
def train(params):
    env, policy, iters, animate, ID = params

    w = parameters_to_vector(policy.parameters()).detach().numpy()
    es = cma.CMAEvolutionStrategy(w, 0.9)
    f = f_wrapper(env, policy, animate)

    it = 0
    try:
        while not es.stop():
            it += 1
            if it > iters:
                break
            if it % 30 == 0:
                sdir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)),
                    "agents/{}_es.p".format(ID))
                vector_to_parameters(
                    torch.from_numpy(es.result.xbest).float(),
                    policy.parameters())
                T.save(policy, sdir)
                print("Saved checkpoint, {}".format(sdir))

            print(es.mean.min(), es.mean.max())
            X = es.ask()

            es.tell(X, [f(x) for x in X])
            es.disp()

    except KeyboardInterrupt:
        print("User interrupted process.")

    return es.result.fbest
Example #16
    def optim_value_lbfgs(self, V_target, inds):
        value = self.value
        value.zero_grad()
        loss_fn = self.loss_func_v
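        # LBFGS may re-evaluate the objective several times per step, so the loss
        # computation below is wrapped in a closure that is handed to optimizer.step.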

        def V_closure():
            predicted = value(self.s[inds],
                              other_data=self.other_data[inds] if
                              self.other_data is not None else None).squeeze()
            loss = loss_fn(predicted, V_target)
            self.value_loss += loss.item()
            optimizer.zero_grad()
            loss.backward()
            return loss

        old_params = parameters_to_vector(value.parameters())
        for lr in self.lr * .5**np.arange(10):
            optimizer = optim.LBFGS(self.value.parameters(), lr=lr)
            optimizer.step(V_closure)
            current_params = parameters_to_vector(value.parameters())
            if any(np.isnan(current_params.data.cpu().numpy())):
                print("LBFGS optimization diverged. Rolling back update...")
                vector_to_parameters(old_params, value.parameters())
            else:
                return
Example #17
    def generate(self):
        if self.candidate_idx == self.pop_size:
            # Tell candidate scores
            self.es.tell(self.candidates, self.candidate_scores)
            self.candidates = self.es.ask(self.pop_size)
            if self.weight_decay > 0:
                self.candidates = [
                    self.decay(c, self.weight_decay) for c in self.candidates
                ]
            self.candidate_scores = []
            self.candidate_idx = 0
            self.es.disp()

        candidate = self.candidates[self.candidate_idx]
        self.candidate_idx += 1
        vector_to_parameters(
            torch.from_numpy(candidate).float(), self.convnet.parameters())
        seed_noise = T.randn(1, self.noise_dim)
        with T.no_grad():
            mat = self.convnet(seed_noise)[0].numpy()
        mat = self.normalize_map(mat)

        mat[0, :] = 255
        mat[:, 0] = 255
        mat[-1, :] = 255
        mat[:, -1] = 255

        cv2.imwrite(self.filename, mat)
Example #18
    def step(self, H, step_size=1, closure=None):

        # re-evaluate the model through the closure to obtain the current loss, if one is provided
        loss = None
        if closure is not None:
            loss = closure()

        # set parameters
        params = [p for p in self.param_groups[0]['params']]
        grads = [p.grad for p in params]

        # convert parameters to a vector
        param_vector = parameters_to_vector(params)
        grad_vector = parameters_to_vector(grads)

        # solve H x = g for the preconditioned step direction x = H^{-1} g
        soln, _ = torch.solve(
            grad_vector.unsqueeze(1).unsqueeze(0), H.unsqueeze(0))
        scaled_gradient = soln[0].reshape(-1)

        # scale the step so the estimated divergence stays within the limit
        scaling = torch.dot(scaled_gradient, soln.reshape(-1))
        scaled_gradient *= step_size * torch.sqrt(self.divergence_limit /
                                                  (scaling + self.epsilon))

        # check that the scaling is ok before updating parameters
        if scaling > 0.:
            # update the gradient weights
            vector_to_parameters(scaled_gradient, grads)

        # now we can perform the update
        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.clone(
                            d_p).detach()
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf

                p.data.add_(-group['lr'], d_p)

        return loss
Example #19
 def init_param(self, init_values: to.Tensor = None, **kwargs):
     # See RNNPolicyBase
     if init_values is None:
         # Initialize the layers using default initialization
         init_param(self.rnn_layers, **kwargs)
         init_param(self.output_layer, **kwargs)
     else:
         cp.vector_to_parameters(init_values, self.parameters())
Example #20
 def object_loss(self, theta):
     model = copy.deepcopy(self.policy)
     vector_to_parameters(theta, model.parameters())
     imp_fac = self.compute_imp_fac(model=model)
     loss = -(imp_fac *
              self.A).mean() - self.entropy_weight * self.compute_entropy()
     curkl = self.mean_kl_divergence(model=model)
     return loss, curkl
Example #21
 def get_best(self):
     with torch.no_grad():
         vector_to_parameters(
             torch.from_numpy(self.es.result.xbest).float(),
             self.convnet.parameters())
         sol = self.convnet(torch.randn(
             1, self.noise_dim)).squeeze(0).numpy()
         return self.normalize_map(sol)
Example #22
    def step(self, closure=None):
        """Performs a single optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            weight_decay = group['weight_decay']
            momentum = group['momentum']
            dampening = group['dampening']
            nesterov = group['nesterov']

            # HESSIAN VEC COMPUTATION
            # vectorize all parameters
            grad_vec = parameters_to_vector(group['params'])
            # create noise vector
            noise = torch.normal(mean=torch.zeros_like(grad_vec), std=self.noise_factor)
            # compute the product
            grad_product = torch.sum(grad_vec * noise)
            grad_grad = torch.autograd.grad(
                grad_product, group['params'], retain_graph=True
            )
            # h_v_p = hessian_vec_product
            fisher_vec_prod = torch.cat([g.contiguous().view(-1) for g in grad_grad])
            hessian_vec_prod = fisher_vec_prod + (self.cg_damping * noise)

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad
                d_p = p.grad.clone().data

                # standard SGD update: weight decay, momentum, optional Nesterov
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                if momentum != 0:
                    param_state = self.state[p]
                    if 'momentum_buffer' not in param_state:
                        buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    if nesterov:
                        d_p = d_p.add(momentum, buf)
                    else:
                        d_p = buf
                p.data.add_(-group['lr'], d_p)
            flattened = parameters_to_vector(group['params'])
            flattened.data.add_(group['lr'], hessian_vec_prod.data)
            vector_to_parameters(flattened, group['params'])

        return loss
Example #23
    def optimize(self):
        self.total_steps += self.steps_per_train

        if self.total_steps >= self.learning_start:
            experience_sample = ray.get(
                self.experience_replay.sample.remote(self.batch_size))
            state = torch.cat([
                torch.from_numpy(s.state).cuda().unsqueeze(0)
                for s in experience_sample
            ])
            next_state = torch.cat([
                torch.from_numpy(s.next_state).cuda().unsqueeze(0)
                for s in experience_sample
            ])
            terminal = (torch.tensor([s.terminal for s in experience_sample
                                      ]).cuda().unsqueeze(1))
            reward = (torch.tensor([s.reward for s in experience_sample
                                    ]).cuda().unsqueeze(1))
            action = torch.tensor([s.action for s in experience_sample]).cuda()

            # Train value function
            target = (
                reward + self.gamma * (1 - terminal) * self.target_value_fn(
                    next_state, self.target_policy(next_state))).detach()
            actual = self.online_value_fn(state, action)
            value_fn_loss = self.value_fn_criterion(target, actual)
            value_fn_loss.backward()
            self.value_fn_opt.step()
            self.online_policy.zero_grad()
            self.online_value_fn.zero_grad()

            # Train policy
            policy_loss = -self.online_value_fn(
                state, self.online_policy(state)).mean()
            policy_loss.backward()
            self.policy_opt.step()
            self.online_policy.zero_grad()
            self.online_value_fn.zero_grad()

            # Update target networks
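            # Soft (Polyak) update in flat parameter space:
            #   theta_target <- polyak * theta_target + (1 - polyak) * theta_online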
            v_policy = parameters_to_vector(self.online_policy.parameters())
            v_policy_targ = parameters_to_vector(
                self.target_policy.parameters())
            new_v_policy_targ = (self.polyak * v_policy_targ +
                                 (1 - self.polyak) * v_policy)
            vector_to_parameters(new_v_policy_targ,
                                 self.target_policy.parameters())

            v_value_fn = parameters_to_vector(
                self.online_value_fn.parameters())
            v_value_fn_targ = parameters_to_vector(
                self.target_value_fn.parameters())
            new_v_value_fn_targ = (self.polyak * v_value_fn_targ +
                                   (1 - self.polyak) * v_value_fn)
            vector_to_parameters(new_v_value_fn_targ,
                                 self.target_value_fn.parameters())
Example #24
def _pes_sample_one(G, param):
    """ Sample one rollout with the current setting. """
    pol_param, dom_param, init_state = param
    vector_to_parameters(pol_param, G.policy.parameters())

    return rollout(G.env,
                   G.policy,
                   reset_kwargs={
                       'init_state': init_state,
                       'domain_param': dom_param,
                   })
Example #25
    def learn(self, env, max_iter, batch_size):
        for i_iter in xrange(max_iter):
            s = env.reset()
            self._noise_generator.reset()
            done = False
            add_noise = i_iter * 1.0 / max_iter < self.explore_fraction
            e_reward = 0
            while not done:
                # env.render()
                noise = torch.FloatTensor(
                    self._noise_generator.generate()) if add_noise else None
                a = self.act(s, noise=noise)
                s_, r, done, info = env.step(a)
                self._replay_module.add(tuple((s, a, [r], s_, [int(done)])))
                s = s_
                e_reward += r

                if len(self._replay_module) < self.warmup_size:
                    continue
                # sample batch transitions
                b_s, b_a, b_r, b_s_, b_d = self._replay_module.sample(
                    batch_size)
                b_s = numpy.vstack(b_s)
                b_a = numpy.vstack(b_a)
                b_s, b_a, b_r, b_d = map(
                    lambda ryo: Variable(torch.FloatTensor(ryo)),
                    [b_s, b_a, b_r, b_d])
                b_s_ = Variable(torch.FloatTensor(b_s_), volatile=True)

                # update critic
                self._optimizer_critic.zero_grad()
                y = b_r + self.reward_gamma * self._target_critic(
                    b_s_, self._target_actor(b_s_)) * (1 - b_d)
                loss = self.loss(self._critic(b_s, b_a), y)
                loss.backward()
                self._optimizer_critic.step()

                # update actor
                self._optimizer_actor.zero_grad()
                loss = -self._critic(
                    b_s, self._actor(b_s)).mean()  # dpg, eq6 in [1]
                loss.backward()
                self._optimizer_actor.step()

                # update target networks
                for target, normal in [(self._target_actor, self._actor),
                                       (self._target_critic, self._critic)]:
                    target_vec = parameters_to_vector(target.parameters())
                    normal_vec = parameters_to_vector(normal.parameters())
                    vector_to_parameters(
                        (1 - self.tau) * target_vec + self.tau * normal_vec,
                        target.parameters())
            logger.info('Iter: {}, E_Reward: {}'.format(
                i_iter, round(e_reward, 2)))
Example #26
 def surrogate_loss(self, theta):
     """
     Returns the surrogate loss w.r.t. the given parameter vector theta
     """
     new_model = copy.deepcopy(self.policy_model)
     vector_to_parameters(theta, new_model.parameters())
     observations_tensor = self.observations
     prob_new = new_model(observations_tensor).gather(
         1, torch.cat(self.actions)).data
     prob_old = self.policy_model(observations_tensor).gather(
         1, torch.cat(self.actions)).data + 1e-8
     return -torch.mean((prob_new / prob_old) * self.advantage)
Example #27
    def step(self,
             episodes,
             max_kl=1e-3,
             cg_iters=10,
             cg_damping=1e-2,
             ls_max_steps=10,
             ls_backtrack_ratio=0.5):
        """Meta-optimization step (ie. update of the initial parameters), based 
        on Trust Region Policy Optimization (TRPO, [4]).
        """
        old_loss, _, old_pis = self.surrogate_loss(episodes)
        print('old_loss: ', old_loss)
        # even when old_loss is on the order of 1e-8, its gradients need not be small
        grads = torch.autograd.grad(old_loss, self.policy.parameters())
        grads = parameters_to_vector(grads)
        print('grads: ', grads)

        # Compute the step direction with Conjugate Gradient
        hessian_vector_product = self.hessian_vector_product(
            episodes, damping=cg_damping)
        stepdir = conjugate_gradient(hessian_vector_product,
                                     grads,
                                     cg_iters=cg_iters)

        # Compute the Lagrange multiplier
        shs = 0.5 * torch.dot(
            stepdir,
            hessian_vector_product(stepdir))  # dot of 3 matrices, sT.H.s
        lagrange_multiplier = torch.sqrt(shs / max_kl)
        # The difference between pi and old_pi is neglected here: the step
        # direction is computed only once, with all likelihood ratios taken to be 1.
        step = stepdir / lagrange_multiplier
        print('step: ', step)

        # Save the old parameters
        old_params = parameters_to_vector(self.policy.parameters())

        # Line search
        step_size = 1.0
        for _ in range(ls_max_steps):
            # assign values to policy network parameters
            # step is fixed during line search
            vector_to_parameters(old_params - step_size * step,
                                 self.policy.parameters())
            # print('oldpis: ', old_pis)
            loss, kl, _ = self.surrogate_loss(episodes, old_pis=old_pis)
            improve = loss - old_loss
            if (improve.item() < 0.0) and (kl.item() < max_kl):
                break
            step_size *= ls_backtrack_ratio
        else:
            vector_to_parameters(old_params, self.policy.parameters())
Example #28
    def _line_search(self, old_loss, loss_grad, step_vector_x, advantage_batch,
                     s_batch, old_policy, a_batch):
        old_actor = copy.deepcopy(self.actor)

        actor_flat_params = parameters_to_vector(self.actor.parameters())
        expected_improve = (loss_grad * step_vector_x).sum(0, keepdim=True)
        expected_improve = expected_improve.cpu().numpy()

        i, line_search_succeed = -1, False
        for i in range(self.backtrack_iters):
            # Update the policy via backtracking line search
            backtrack_ratio = self.backtrack_coeff**i
            constraint_params = actor_flat_params + backtrack_ratio * step_vector_x
            vector_to_parameters(constraint_params, self.actor.parameters())

            # Recompute the mean of log pi(a|s) * A under the updated actor
            meow, logstd, std = self.actor(s_batch)
            new_policy = self._log_density(a_batch, meow, std, logstd)
            constraint_loss = self._surrogate_loss(
                old_policy=old_policy,
                new_policy=new_policy,
                advantage_batch=advantage_batch)
            loss_improve = (constraint_loss - old_loss).detach().cpu().numpy()
            weighted_expected_improve = backtrack_ratio * expected_improve
            kl = kl_divergence(new_actor=self.actor,
                               old_actor=old_actor,
                               s_batch=s_batch)
            kl = kl.mean()

            TrainerMetadata().log(kl, 'KL', 'current_kl', compute_maxmin=True)
            TrainerMetadata().log(self.max_kl, 'KL', 'max_kl')
            TrainerMetadata().log(loss_improve / weighted_expected_improve,
                                  'real / expected (improve)',
                                  'real_ratio',
                                  compute_maxmin=True)
            TrainerMetadata().log(0.5, 'real / expected (improve)',
                                  'threshold ')
            # TrainerMetadata().log(expected_improve, 'expected_improve', compute_maxmin=True)

            # see https://en.wikipedia.org/wiki/Backtracking_line_search
            # TODO: why 0.5? Shouldn't the ratio exceed 1.0 to count as an improvement?
            # For now, note that Armijo used 1/2 for both c and tau in his 1966 paper.
            if kl < self.max_kl and (loss_improve /
                                     weighted_expected_improve) > 0.5:
                line_search_succeed = True
                break

        TrainerMetadata().console_log('KL_iter', i)

        if not line_search_succeed:
            self.actor = copy.deepcopy(old_actor)
            print('policy update does not improve the surrogate')
Example #29
    def f(w):
        reward = 0
        done = False
        obs = env.reset()

        vector_to_parameters(torch.from_numpy(w).float(), policy.parameters())

        while not done:
            with torch.no_grad():
                act = policy.sample_action(obs)
            obs, rew, done, _ = env.step(act)
            reward += rew

        return -reward
Example #30
 def step_test(self,
               episodes,
               max_kl=1e-3,
               cg_iters=10,
               cg_damping=1e-2,
               ls_max_steps=10,
               ls_backtrack_ratio=0.5):
     """Meta-optimization step (ie. update of the initial parameters), based
     on Trust Region Policy Optimization (TRPO, [4]).
     """
     grads = self.compute_ng_gradient_test(episodes)
     old_params = parameters_to_vector(self.policy.parameters())
     update_params = self.adam_step(old_params, grads)
     vector_to_parameters(update_params, self.policy.parameters())