Example #1
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()
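Example #1 is the canonical REINFORCE action-selection helper. Below is a minimal self-contained sketch of the same pattern; the imports and the tiny two-action policy network are assumptions added only to make the snippet runnable, not part of the original code.

import numpy as np
import torch
import torch.nn as nn
from torch.distributions import Categorical

# hypothetical 4-observation / 2-action policy, only to make the sketch runnable
policy = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2), nn.Softmax(dim=-1))
policy.saved_log_probs = []   # the REINFORCE examples store log-probs on the policy object

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)      # (4,) -> (1, 4)
    probs = policy(state)                                      # (1, 2), rows sum to 1
    m = Categorical(probs)                                     # distribution over action indices
    action = m.sample()                                        # shape (1,)
    policy.saved_log_probs.append(m.log_prob(action))          # log pi(a|s) for the policy-gradient update
    return action.item()

print(select_action(np.zeros(4, dtype=np.float32)))            # prints 0 or 1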
Example #2
def select_action(state):
    state = torch.from_numpy(state).float()
    probs, state_value = model(state)
    m = Categorical(probs)
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()
Example #3
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs, state_value = model(Variable(state))
    m = Categorical(probs)
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.data[0]
Example #4
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    state = state.cuda()
    probs = policy(Variable(state))
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.data[0]
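Examples #3 and #4 are written against the pre-0.4 PyTorch API (`Variable`, `action.data[0]`). On current PyTorch the wrapper is unnecessary and `.item()` replaces `.data[0]`; the following is a sketch of the equivalent body, assuming the same external `policy` network with a `saved_log_probs` list as in Example #4.

import torch
from torch.distributions import Categorical

def select_action(state):
    # `policy` is assumed to be the caller's network, as in Example #4
    state = torch.from_numpy(state).float().unsqueeze(0)
    if torch.cuda.is_available():
        state = state.cuda()
    probs = policy(state)                               # no Variable(...) wrapper needed
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()                                # replaces action.data[0]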
Example #5
 def run(self, x):
     x=Variable(x)
     p=self(x)
     if self.original_output:
         d=Categorical(logits=p)
     else:
         #Suppose that after the output_activation we get probabilities (i.e. a softmax activation).
         #This assumption might be false.
         d=Categorical(probs=p)
     action=d.sample()
     log_prob=d.log_prob(action)
     return action, log_prob 
Example #6
 def run(self, x):
     x=Variable(Tensor(x))
     p=self(x)
     if self.original_output:
         d=Categorical(logits=p)
     else:
         #Suppose that after the output_activation we get probabilities (i.e. a softmax activation).
         #This assumption might be false.
         d=Categorical(probs=p)
     action=d.sample()
     self.history_of_log_probs.append(d.log_prob(action))
     return action #haven't checked the type of action, might be buggy here
Example #7
    def ctrl_fn(state):
        state_feats = torch.from_numpy(state.board.flatten()).float().unsqueeze(0)
        probs, value = net(state_feats)
        mask = torch.zeros_like(probs).index_fill_(1, torch.from_numpy(state.valid_actions), 1)
        probs = probs * mask

        if train:
            m = Categorical(probs)
            action = m.sample()

            net.log_prob = m.log_prob(action)
            net.value = value
            return action.item()
        else:
            action = probs.argmax(dim=-1)
            return action.item()
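Example #7 zeroes out invalid actions by multiplying the probability vector with a 0/1 mask; `Categorical` only requires non-negative weights and renormalizes them along the last dimension, so this works as long as at least one valid action has non-zero probability. A small sketch of the two masking idioms that appear in this collection (the tensors here are made up for illustration):

import torch
from torch.distributions import Categorical

probs = torch.tensor([[0.1, 0.2, 0.3, 0.4]])          # (1, 4) action probabilities
valid = torch.tensor([1, 3])                          # indices of allowed actions

# idiom 1 (Example #7): multiply probs by a 0/1 mask; Categorical renormalizes
mask = torch.zeros_like(probs).index_fill_(1, valid, 1)
m1 = Categorical(probs * mask)

# idiom 2 (used by the job/machine examples below): push invalid logits to a large negative value
logits = probs.log()
logits[:, [0, 2]] = -1e8                              # the indices NOT in `valid`
m2 = Categorical(logits=logits)

print(m1.probs, m2.probs)                             # both ~[[0., 1/3, 0., 2/3]]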
Example #8
    def __init__(self, n_obs, mu0, sigma0, n_clusters, hidden_dim = 30):

        # dimension of the problem
        self.dim = len(mu0)
        self.n_clusters = n_clusters
        self.n_obs = n_obs

        # prior parameters
        self.mu0 = mu0
        self.sigma0 = torch.Tensor([sigma0])
        # uniform prior on weights
        self.prior_weights = torch.ones(self.n_clusters) / self.n_clusters

        # true parameters
        self.set_true_params()
        self.cat_rv = Categorical(probs = self.prior_weights)

        # the encoder
        # self.gmm_encoder = GMMEncoder(data_dim = self.dim,
        #                      n_classes = self.n_clusters,
        #                      hidden_dim = hidden_dim)
        #
        # self.var_params = {'encoder_params': self.gmm_encoder.parameters()}


        # other variational parameters: we use point masses for
        # the means and variances
        self.set_random_var_params()

        # draw data
        self.n_obs = n_obs
        self.y, self.z = self.draw_data(n_obs = n_obs)
Example #9
 def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
     if not isinstance(total_count, Number):
         raise NotImplementedError('inhomogeneous total_count is not supported')
     self.total_count = total_count
     self._categorical = Categorical(probs=probs, logits=logits)
     batch_shape = self._categorical.batch_shape
     event_shape = self._categorical.param_shape[-1:]
     super(Multinomial, self).__init__(batch_shape, event_shape, validate_args=validate_args)
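The `Multinomial.__init__` in Example #9 derives its shapes from the wrapped `Categorical`: the batch shape is everything but the last parameter dimension, and the event shape is that last dimension. A quick sketch of what those attributes look like (the shapes are chosen arbitrarily):

import torch
from torch.distributions import Categorical

probs = torch.ones(3, 5) / 5          # 3 independent 5-way categoricals
cat = Categorical(probs=probs)

print(cat.batch_shape)                # torch.Size([3])
print(cat.param_shape)                # torch.Size([3, 5])
print(cat.param_shape[-1:])           # torch.Size([5])  -> Multinomial event_shape
print(cat.sample().shape)             # torch.Size([3]), one class index per batch row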
Example #10
    def __init__(self, slen = 68,
                    padding = 14,
                    data_dir = '../mnist_data/',
                    propn_sample = 1.0,
                    indices = None,
                    train_set = True):

        # slen is the side length of the image on which an mnist digit (28 x 28)
        # is placed. Padding is the width of the border of the full image

        super(MovingMNISTDataSet, self).__init__()

        # Load MNIST dataset
        assert os.path.exists(data_dir)

        # This is the full dataset
        self.mnist_data_set = load_mnist_data(data_dir = data_dir, train = train_set)

        if train_set:
            n_image_full = len(self.mnist_data_set.train_labels)
        else:
            n_image_full = len(self.mnist_data_set.test_labels)

        # we may wish to subset
        if indices is None:
            self.num_images = round(n_image_full * propn_sample)
            self.sample_indx = np.random.choice(n_image_full, self.num_images, replace = False)
        else:
            self.num_images = len(indices)
            self.sample_indx = indices

        # set up parameters for moving MNIST

        # original mnist side length
        self.mnist_slen = self.mnist_data_set[0][0].shape[-1]
        # padded side-length
        self.slen = slen
        self.padding = padding
        # number of possible pixel locations
        self.n_pixel_1d = (slen -  2 * padding) ** 2
        # define uniform categorical variable over pixels
        unif_probs = torch.ones(self.n_pixel_1d) / self.n_pixel_1d
        unif_probs = unif_probs.view(-1, self.n_pixel_1d)
        self.categorical = Categorical(unif_probs)
        # for padding the image, we cache this grid
        r0 = (slen - 1) / 2
        self.grid_out = \
            torch.FloatTensor(np.mgrid[0:slen, 0:slen].transpose() - r0)
Example #11
    def forward(self, jobs, machines, allocable_jobs=None, allocable_machines=None, argmax=False):
        job_input_size = jobs.size(0)
        machine_input_size = machines.size(0)

        E_j, E_m = self.get_embedding(jobs, machines)
        g_j1 = self.get_job_attention(self.last_j, E_j)
        g_m1 = self.get_node_attention(self.last_j, E_m)
        E_j = torch.cat([E_j, self.no_select_job.unsqueeze(0)], dim=0)
        E_m = torch.cat([E_m, self.no_select_machine.unsqueeze(0)], dim=0)

        g_1 = torch.cat([self.last_j, g_j1, g_m1])
        j_logits = self.j_att(g_1, E_j)

        ### selecting processes
        if allocable_jobs is not None:
            x = []
            for _ in range(job_input_size):
                if _ not in allocable_jobs:
                    x.append(_)
            if len(x) > 0:
                mask = torch.from_numpy(np.array(x, dtype=int))

                j_logits[mask] = -1e8

        job_softmax = torch.softmax(j_logits, 0)
        job_sampler = Categorical(job_softmax)

        if argmax:
            selected_job = torch.argmax(job_softmax)
        else:
            try:
                selected_job = job_sampler.sample()
            except:
                raise UnboundLocalError;

        if selected_job == job_input_size:
            return (-1, -1), job_sampler.log_prob(selected_job)

        as_i = int(selected_job.detach().numpy())

        e_js = E_j[selected_job]
        g_j2 = self.get_job_attention(e_js, E_j)
        g_m2 = self.get_node_attention(e_js, E_m)
        g_2 = torch.cat([e_js, g_j2, g_m2])
        m_logits = self.m_att(g_2, E_m)
        self.last_j = e_js.detach()
        ### selecting process
        if allocable_machines is not None:
            x = []
            for _ in range(machine_input_size):
                if _ not in allocable_machines[as_i]:
                    x.append(_)
            mask = torch.from_numpy(np.array(x, dtype=int))
            m_logits[mask] = -1e8

        machine_softmax = torch.softmax(m_logits, 0)
        machine_sampler = Categorical(machine_softmax)
        if argmax:
            selected_machine = torch.argmax(machine_softmax, -1)
        else:
            selected_machine = machine_sampler.sample()


        logpas = machine_sampler.log_prob(selected_machine) + job_sampler.log_prob(selected_job)
        if selected_machine == machine_input_size:
            return (-1, -1), logpas
        return (int(selected_job.detach().numpy()), int(selected_machine.detach().numpy())), logpas
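The forward pass above applies `torch.softmax` and then builds `Categorical` from probabilities; `Categorical(logits=...)` performs the same normalization internally (via log-softmax) and is numerically a bit more robust. A minimal check of the equivalence, with arbitrary logits:

import torch
from torch.distributions import Categorical

logits = torch.tensor([2.0, -1e8, 0.5, 1.0])             # one entry masked out

d_probs = Categorical(probs=torch.softmax(logits, 0))    # pattern used above
d_logits = Categorical(logits=logits)                    # equivalent, no explicit softmax

a = d_logits.sample()
print(torch.allclose(d_probs.probs, d_logits.probs))     # True
print(d_probs.log_prob(a), d_logits.log_prob(a))         # same value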
Example #12
 def get_action(self, state):
     state = u.t_from_np_to_float32(state)
     probs = self.actor(state)
     return Categorical(probs).sample().item()
Example #13
def learn_multi(model, update_timestep, env, max_reward=-2):
    log_interval = update_timestep
    total_correct_moves=0
    correct_moves = 0
    my_rewards    = [0, 0, 0, 0] # used for correct reward adding (if round is finished)
    max_reward    = 1
    jjj           = 0
    for i_episode in range(100000000):
        h_out = (torch.zeros([1, 1, 32], dtype=torch.float), torch.zeros([1, 1, 32], dtype=torch.float))
        s = env.reset()
        done = False
        while not done:
            for t in range(T_horizon):
                i      = env.my_game.active_player
                h_in = h_out
                prob, h_out = model[i].pi(torch.from_numpy(s).float(), h_in)
                prob = prob.view(-1)
                m = Categorical(prob)
                a = m.sample().item()
                s_prime, r, done, info = env.step(a)
                if r["ai_reward"] is None: # illegal move
                    rr=-1
                else:  # shift round -> 0 or legal play move
                    rr=0
                model[i].put_data((s, a, rr, s_prime, prob[a].item(), h_in, h_out, done))
                if info["round_finished"] and r["state"] == "play" and int(r["ai_reward"]) is not None:
                    for u in range(4):
                        last_transition= model[u].data
                        if len(model[u].data)>1:
                            last_transition= model[u].data[:-1]
                        last_transition_list = list(last_transition[0])
                        last_transition_list[2] =  (int(r["final_rewards"][u])+60)/40
                        last_transition[0] = last_transition_list
                        model[u].data[:-1] = last_transition

                    # win_player = r["player_win_idx"]
                    # last_transition= model[win_player].data
                    # if len(model[win_player].data)>1:
                    #     last_transition= model[win_player].data[:-1]
                    # last_transition_list = list(last_transition[0])
                    # last_transition_list[2] =  int(r["final_rewards"][win_player])
                    # last_transition[0] = last_transition_list
                    # model[win_player].data[:-1] = last_transition

                s = s_prime

                if done:
                    break
            #for lll in range(4):
            model[i].train_net()
        total_correct_moves +=info["correct_moves"]

        if i_episode % log_interval == 0:
            jjj +=1
            total_correct_moves = total_correct_moves/log_interval
            corr_moves, mean_reward, finished_games =  test_with_random(model[0], env, jjj)
            #test play against random
            aaa = ('Game ,{:07d}, reward per game in {} g. ,{:0.5}, corr_moves ,{:4.4},  Time ,{},\n'.format(i_episode, finished_games, float(mean_reward), float(corr_moves), datetime.datetime.now()-start_time))
            print(aaa)
            #max correct moves: 61
            if mean_reward>max_reward and corr_moves>2.0:
                 path =  'PPO_{}_{}_{}'.format(i_episode, finished_games, mean_reward)
                 torch.save(model[0].state_dict(), path+".pth")
                 max_reward = mean_reward
                 print("exported path \n")

            total_correct_moves = 0
            with open(log_path, "a") as myfile:
                myfile.write(aaa)
Example #14
def sample_class_weights(class_weights):
    # draw a sample from Categorical variable with
    # probabilities class_weights

    cat_rv = Categorical(probs = class_weights)
    return cat_rv.sample().detach()
Example #15
def lfl(n_run, tmax, kmax, trajectories=None, computeTrajectories=False):
    # set hyperparameters
    gride_size = 5
    n_states = gride_size**2
    n_actions = 4
    mu = np.zeros(n_states)
    mu[0] = 1
    gamma = 0.96
    alpha = 0.3
    alpha_model = 0.7
    entropy_coef = 0.01
    n_epoch = 10

    # generate a deterministic gridworld:
    g = Grid(gride_size, stochastic=False)

    # we just need the reward and dynamic of the MDP:
    r_gpomdp, p_gpomdp, _ = g.make_tables_gpomdp()
    r, p = g.make_tables()

    learner_score = []
    observer_score = []
    weights = []
    trajectories_spi = []
    for run in range(n_run):
        print('run', run)
        torch.manual_seed(run)

        # init first policy
        pi = np.ones((n_states, n_actions)) / n_actions

        # sample initial trajectory:
        if trajectories is None:
            np.random.seed(run)
            trajectory = sample_sa_trajectory(p, pi, tmax)
        else:
            trajectory = trajectories[run, 0]
            print(trajectory.shape)
            trajectory = trajectory.tolist()

        # transition estimation:
        p_ = np.ones((n_states, n_actions, n_states)) * 1e-15
        count = np.ones((n_states, n_actions, n_states)) * n_states * 1e-15
        for (s, a), (s_, _) in zip(trajectory[:-1], trajectory[1:]):
            p_[int(s), int(a), int(s_)] += 1
            count[int(s), int(a), :] += 1

        p_ /= count

        demos = [trajectory]
        policies = [torch.Tensor(pi)]

        # policy iterations
        for k in range(kmax):
            if trajectories is None:
                q = np.random.rand(n_states, n_actions)
                for _ in range(100):
                    v = np.zeros(n_states)
                    for state in range(n_states):
                        for action_ in range(n_actions):
                            v[state] += pi[state, action_] * \
                                        (q[state, action_] - alpha * np.log(pi[state, action_]))

                    q *= 0
                    for state in range(n_states):
                        for action in range(n_actions):
                            q[state, action] = r[state, action]
                            for state_ in range(n_states):
                                q[state,
                                  action] += gamma * p[state, action,
                                                       state_] * v[state_]

                pi = np.zeros((n_states, n_actions))
                for state in range(n_states):
                    pi[state, :] = softmax(q[state, :] / alpha)

                # sample trajectory with new policy:
                trajectory = sample_sa_trajectory(p, pi, tmax)
            else:
                trajectory = trajectories[run, k + 1]
                trajectory = trajectory.tolist()

            demos.append(trajectory)
            policies.append(torch.Tensor(pi))

        if not computeTrajectories:
            # learner  score
            mdp_to_evaluate = MDP(n_states, n_actions, p_gpomdp, r_gpomdp, mu,
                                  gamma)
            j_pi_learner = mdp_to_evaluate.policy_evaluation(pi.T)[0]
            learner_score.append(j_pi_learner)

            # estimate learner policies
            torch_p = torch.from_numpy(p_).float()
            logpi_ = tuple(nn.Parameter(torch.rand(n_states, n_actions, \
                                                   requires_grad=True)) \
                           for _ in range(kmax + 1))
            optimizer_pi = torch.optim.Adam(logpi_, lr=5e-1)
            for epoch in range(n_epoch):
                loss_pi = 0
                for k, demo in enumerate(demos):
                    demo_sas = [(s, a, s_)
                                for (s, a), (s_,
                                             _) in zip(demo[:-1], demo[1:])]
                    for s, a, s_ in demo_sas:
                        dist = Categorical(torch.exp(logpi_[k][int(s), :]))
                        log_prob_demo = torch.log(dist.probs[int(a)])
                        loss_pi -= (log_prob_demo +
                                    entropy_coef * dist.entropy())

                optimizer_pi.zero_grad()
                loss_pi.backward()
                optimizer_pi.step()

            # create target reward functions:
            targets = []
            for k, demo in enumerate(demos[:-1]):
                dist_2 = torch.exp(logpi_[k + 1]) \
                         / torch.exp(logpi_[k + 1]).sum(1, keepdim=True)
                dist_1 = torch.exp(logpi_[k]) / torch.exp(logpi_[k]).sum(
                    1, keepdim=True)
                kl = torch.log(dist_2) - torch.log(dist_1)
                r_shape = torch.zeros(n_states, n_actions)
                for state in range(n_states):
                    for action in range(n_actions):
                        r_shape[state, action] = alpha_model \
                                                 * torch.log(dist_2[state, action])
                        for state_ in range(n_states):
                            for action_ in range(n_actions):
                                r_shape[state, action] -= alpha_model * gamma \
                                                          * (kl[state_, action_]) * torch_p[state, action, state_] \
                                                          * dist_1[state_, action_]

                targets.append(r_shape)

            # recover state-action reward and shaping
            r_ = nn.Parameter(
                torch.zeros(n_states, n_actions, requires_grad=True))
            r_sh = (r_,) + tuple(nn.Parameter(torch.zeros(n_states, requires_grad=True)) \
                                 for _ in range(kmax))
            optimizer = torch.optim.Adam(r_sh, lr=1)
            for epoch in range(200):
                loss = 0
                for k, target in enumerate(targets):
                    loss += \
                        ((r_sh[0] + r_sh[k + 1].repeat(n_actions, 1).t() - gamma * \
                          torch.sum(torch_p * r_sh[k + 1].repeat(n_states, n_actions, 1), 2) \
                          - target.detach()) ** 2).sum()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            r_ = r_.detach().numpy()

            # solve with r_:
            mdp = MDP(n_states, n_actions, p_gpomdp, r_.T, mu, gamma)
            pi_observer = mdp.get_best_policy()

            # observer score with true reward:
            mdp_to_evaluate = MDP(n_states, n_actions, p_gpomdp, r_gpomdp, mu,
                                  gamma)
            j_pi_observer = mdp_to_evaluate.policy_evaluation(pi_observer)[0]

            observer_score.append(j_pi_observer)
            weights.append(r_)
        else:
            trajectories_spi.append(demos)
    np.save(
        '../results/comparison_learn/lfl_SPI-v3/lfl_svi_' + '' + str(kmax + 1),
        observer_score)
    np.save(
        '../results/comparison_learn/lfl_SPI-v3/weights_svi_' + '' +
        str(kmax + 1), weights)
Example #16
 def act(self, obs):
     with torch.no_grad():
         prob = self.policy_net(obs)
         m = Categorical(prob)
     return m.sample().item()
Example #17
def local_train(process, global_model, optimizer):
    env = CreateBreakout()
    local_model = ActorCriticNet()
    local_model.load_state_dict(global_model.state_dict())

    total_reward = 0
    max_score = 0

    for T in range(max_T):
        state = env.reset()
        done = False
        score = 0

        while not done:
            log_probs, values, entropys, rewards = [], [], [], []
            for t in range(max_t):
                prob, value = local_model(torch.FloatTensor([state]))

                m = Categorical(prob)
                action = m.sample()
                log_prob = m.log_prob(action)
                entropy = m.entropy()

                next_state, reward, done, _ = env.step(action.item())
                score += reward

                log_probs.append(log_prob)
                values.append(value)
                entropys.append(entropy)
                rewards.append(reward)

                state = next_state
                if done:
                    break

            state_final = torch.FloatTensor([next_state])

            R = 0.0
            if not done:
                _, R = local_model(state_final)
                R = R.item()

            td_target_lst = []
            for reward in rewards[::-1]:
                R = reward + R * gamma
                td_target_lst.append([R])
            td_target_lst.reverse()

            log_probs = torch.stack(log_probs)
            values = torch.cat(values)
            entropys = torch.stack(entropys)
            td_targets = torch.FloatTensor(td_target_lst)
            advantages = (td_targets - values).detach()

            actor_loss = -torch.mean(log_probs * advantages)
            critic_loss = F.smooth_l1_loss(values, td_targets.detach())
            entropy_loss = torch.mean(entropys)

            total_loss = actor_loss + critic_loss - beta * entropy_loss

            optimizer.zero_grad()
            local_model.zero_grad()

            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(local_model.parameters(), 5)

            for local_param, global_param in zip(local_model.parameters(),
                                                 global_model.parameters()):
                if global_param.grad is not None:
                    break
                global_param._grad = local_param.grad

            optimizer.step()
            local_model.load_state_dict(global_model.state_dict())

        total_reward += score
        if score > max_score:
            max_score = score

        if (T + 1) % 10 == 0:
            print('Process {} of episode {}, avg score : {}, max score : {}'.
                  format(process, T + 1, total_reward / 10, max_score))
            total_reward = 0

    env.close()
Example #18
def select_action(state):
    action_prb = teacher_model(state.detach())
    m = Categorical(action_prb)
    action = m.sample()
    teacher_model.saved_log_probs.append(m.log_prob(action))
    return action
Example #19
class TwoPlayerGANModel(BaseModel):
    @staticmethod
    def modify_commandline_options(parser, is_train=True):
        """Add new model-specific options and rewrite default values for existing options.

        Parameters:
            parser -- the option parser
            is_train -- if it is training phase or test phase. You can use this flag to add training-specific or test-specific options.

        Returns:
            the modified parser.
        """
        #parser.set_defaults(dataset_mode='aligned')  # You can rewrite default values for this model. For example, this model usually uses aligned dataset as its dataset
        if is_train:
            parser.add_argument(
                '--g_loss_mode',
                type=str,
                default='lsgan',
                help='lsgan | nsgan | vanilla | wgan | hinge | rsgan')
            parser.add_argument(
                '--d_loss_mode',
                type=str,
                default='lsgan',
                help='lsgan | nsgan | vanilla | wgan | hinge | rsgan')
            parser.add_argument('--which_D',
                                type=str,
                                default='S',
                                help='Standard(S) | Relativistic_average (Ra)')

        return parser

    def __init__(self, opt):
        """Initialize this model class.

        Parameters:
            opt -- training/test options

        A few things can be done here.
        - (required) call the initialization function of BaseModel
        - define loss function, visualization images, model names, and optimizers
        """
        BaseModel.__init__(self,
                           opt)  # call the initialization method of BaseModel

        self.opt = opt
        if opt.d_loss_mode == 'wgan' and not opt.use_gp:
            raise NotImplementedError(
                'using wgan on D must be with use_gp = True.')

        self.loss_names = [
            'G_real', 'G_fake', 'D_real', 'D_fake', 'D_gp', 'G', 'D'
        ]
        self.visual_names = ['real_visual', 'gen_visual']

        if self.isTrain:  # only defined during training time
            self.model_names = ['G', 'D']
        else:
            self.model_names = ['G']

        if self.opt.cgan:
            probs = np.ones(self.opt.cat_num) / self.opt.cat_num
            self.CatDis = Categorical(torch.tensor(probs))
        # define networks
        self.netG = networks.define_G(opt.z_dim, opt.output_nc, opt.ngf,
                                      opt.netG, opt.g_norm, opt.cgan,
                                      opt.cat_num, not opt.no_dropout,
                                      opt.init_type, opt.init_gain,
                                      self.gpu_ids)

        if self.isTrain:  # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
            self.netD = networks.define_D(opt.input_nc, opt.ndf, opt.netD,
                                          opt.d_norm, opt.cgan, opt.cat_num,
                                          opt.init_type, opt.init_gain,
                                          self.gpu_ids)

        if self.isTrain:  # only defined during training time
            # define loss functions
            self.criterionG = networks.GANLoss(opt.g_loss_mode, 'G',
                                               opt.which_D).to(self.device)
            self.criterionD = networks.GANLoss(opt.d_loss_mode, 'D',
                                               opt.which_D).to(self.device)
            # initialize optimizers
            self.optimizer_G = torch.optim.Adam(self.netG.parameters(),
                                                lr=opt.lr_g,
                                                betas=(opt.beta1, opt.beta2))
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(),
                                                lr=opt.lr_d,
                                                betas=(opt.beta1, opt.beta2))
            self.optimizers.append(self.optimizer_G)
            self.optimizers.append(self.optimizer_D)
        # visualize settings
        self.N = int(np.trunc(np.sqrt(min(opt.batch_size, 64))))
        if self.opt.z_type == 'Gaussian':
            self.z_fixed = torch.randn(self.N * self.N,
                                       opt.z_dim,
                                       1,
                                       1,
                                       device=self.device)
        elif self.opt.z_type == 'Uniform':
            self.z_fixed = torch.rand(
                self.N * self.N, opt.z_dim, 1, 1, device=self.device) * 2. - 1.
        if self.opt.cgan:
            yf = self.CatDis.sample([self.N * self.N])
            self.y_fixed = one_hot(yf, [self.N * self.N, self.opt.cat_num])

    def set_input(self, input):
        """input: a dictionary that contains the data itself and its metadata information."""
        self.input_imgs = input['image'].to(self.device)
        if self.opt.cgan:
            self.input_targets = input['target'].to(self.device)

    def forward(self, batch_size=None):
        bs = self.opt.batch_size if batch_size is None else batch_size
        if self.opt.z_type == 'Gaussian':
            z = torch.randn(bs, self.opt.z_dim, 1, 1, device=self.device)
        elif self.opt.z_type == 'Uniform':
            z = torch.rand(bs, self.opt.z_dim, 1, 1,
                           device=self.device) * 2. - 1.

        if not self.opt.cgan:
            self.gen_imgs = self.netG(z)
        else:
            y = self.CatDis.sample([bs])
            self.y_ = one_hot(y, [bs, self.opt.cat_num])
            self.gen_imgs = self.netG(z, self.y_)

    def backward_G(self):
        # pass D
        if not self.opt.cgan:
            self.fake_out = self.netD(self.gen_imgs)
            self.real_out = self.netD(self.real_imgs)
        else:
            self.fake_out = self.netD(self.gen_imgs, self.y_)
            self.real_out = self.netD(self.real_imgs, self.targets)

        self.loss_G_fake, self.loss_G_real = self.criterionG(
            self.fake_out, self.real_out)
        self.loss_G = self.loss_G_fake + self.loss_G_real
        self.loss_G.backward()

    def backward_D(self):
        self.gen_imgs = self.gen_imgs.detach()
        # pass D
        if not self.opt.cgan:
            self.fake_out = self.netD(self.gen_imgs)
            self.real_out = self.netD(self.real_imgs)
        else:
            self.fake_out = self.netD(self.gen_imgs, self.y_)
            self.real_out = self.netD(self.real_imgs, self.targets)

        self.loss_D_fake, self.loss_D_real = self.criterionD(
            self.fake_out, self.real_out)
        if self.opt.use_gp is True:
            self.loss_D_gp = networks.cal_gradient_penalty(self.netD,
                                                           self.real_imgs,
                                                           self.gen_imgs,
                                                           self.device,
                                                           type='mixed',
                                                           constant=1.0,
                                                           lambda_gp=10.0)[0]
        else:
            self.loss_D_gp = 0.

        self.loss_D = self.loss_D_fake + self.loss_D_real + self.loss_D_gp
        self.loss_D.backward()

    def optimize_parameters(self):
        for i in range(self.opt.D_iters + 1):
            self.real_imgs = self.input_imgs[i * self.opt.batch_size:(i + 1) *
                                             self.opt.batch_size, :, :, :]
            if self.opt.cgan:
                self.targets = self.input_targets[i *
                                                  self.opt.batch_size:(i + 1) *
                                                  self.opt.batch_size, :]
            self.forward()
            # update G
            if i == 0:
                self.set_requires_grad(self.netD, False)
                self.optimizer_G.zero_grad()
                self.backward_G()
                self.optimizer_G.step()
            # update D
            else:
                self.set_requires_grad(self.netD, True)
                self.optimizer_D.zero_grad()
                self.backward_D()
                self.optimizer_D.step()
Example #20
    def __init__(self, opt):
        """Initialize this model class.

        Parameters:
            opt -- training/test options

        A few things can be done here.
        - (required) call the initialization function of BaseModel
        - define loss function, visualization images, model names, and optimizers
        """
        BaseModel.__init__(self,
                           opt)  # call the initialization method of BaseModel

        self.opt = opt
        if opt.d_loss_mode == 'wgan' and not opt.use_gp:
            raise NotImplementedError(
                'using wgan on D must be with use_gp = True.')

        self.loss_names = [
            'G_real', 'G_fake', 'D_real', 'D_fake', 'D_gp', 'G', 'D'
        ]
        self.visual_names = ['real_visual', 'gen_visual']

        if self.isTrain:  # only defined during training time
            self.model_names = ['G', 'D']
        else:
            self.model_names = ['G']

        if self.opt.cgan:
            probs = np.ones(self.opt.cat_num) / self.opt.cat_num
            self.CatDis = Categorical(torch.tensor(probs))
        # define networks
        self.netG = networks.define_G(opt.z_dim, opt.output_nc, opt.ngf,
                                      opt.netG, opt.g_norm, opt.cgan,
                                      opt.cat_num, not opt.no_dropout,
                                      opt.init_type, opt.init_gain,
                                      self.gpu_ids)

        if self.isTrain:  # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc
            self.netD = networks.define_D(opt.input_nc, opt.ndf, opt.netD,
                                          opt.d_norm, opt.cgan, opt.cat_num,
                                          opt.init_type, opt.init_gain,
                                          self.gpu_ids)

        if self.isTrain:  # only defined during training time
            # define loss functions
            self.criterionG = networks.GANLoss(opt.g_loss_mode, 'G',
                                               opt.which_D).to(self.device)
            self.criterionD = networks.GANLoss(opt.d_loss_mode, 'D',
                                               opt.which_D).to(self.device)
            # initialize optimizers
            self.optimizer_G = torch.optim.Adam(self.netG.parameters(),
                                                lr=opt.lr_g,
                                                betas=(opt.beta1, opt.beta2))
            self.optimizer_D = torch.optim.Adam(self.netD.parameters(),
                                                lr=opt.lr_d,
                                                betas=(opt.beta1, opt.beta2))
            self.optimizers.append(self.optimizer_G)
            self.optimizers.append(self.optimizer_D)
        # visualize settings
        self.N = int(np.trunc(np.sqrt(min(opt.batch_size, 64))))
        if self.opt.z_type == 'Gaussian':
            self.z_fixed = torch.randn(self.N * self.N,
                                       opt.z_dim,
                                       1,
                                       1,
                                       device=self.device)
        elif self.opt.z_type == 'Uniform':
            self.z_fixed = torch.rand(
                self.N * self.N, opt.z_dim, 1, 1, device=self.device) * 2. - 1.
        if self.opt.cgan:
            yf = self.CatDis.sample([self.N * self.N])
            self.y_fixed = one_hot(yf, [self.N * self.N, self.opt.cat_num])
Example #21
 def logprob(self, datas, value_data):
     distribution = Categorical(datas)
     return distribution.log_prob(value_data).float().to(device)      
Example #22
 def entropy(self, datas):
     distribution = Categorical(datas)            
     return distribution.entropy().float().to(device)
Example #23
 def sample(self, datas):
     distribution = Categorical(datas)      
     return distribution.sample().float().to(device)
Example #24
def train_eos(train_data, enc, eos, ldis, rnn=True, device='cpu'):
    enc, _ = enc
    eos, eos_optim = eos
    ldis, dis_optim = ldis

    for data, _, _ in train_data:
        temporal_output = []
        log_prob = []
        past_actions = []
        eos_states = [torch.zeros([1, 1, h], device=device)
                      for h in eos.hidden_sizes]
        if rnn:
            enc_states = [torch.zeros([1, 1, h], device=device)
                          for h in enc.hidden_sizes]
            data = data.unsqueeze(1)
            for d in data:
                d = d.view(1, 1, -1)
                encoded, enc_states = enc(d, enc_states)
                h, eos_states = eos(encoded, eos_states)

                softmax_output = F.softmax(h, -1)
                dist = Categorical(softmax_output)
                action_taken = dist.sample()
                log_prob.append(dist.log_prob(action_taken).squeeze())
                temporal_output.append(encoded.squeeze())
                past_actions.append(action_taken)

        else:
            encoded = enc(data)
            for e in encoded:
                e = e.view(1, 1, -1)
                h, enc_states = eos(e, eos_states)
                softmax_output = F.softmax(h, -1)
                dist = Categorical(softmax_output)
                action_taken = dist.sample()
                log_prob.append(dist.log_prob(action_taken).squeeze())
                temporal_output.append(e.squeeze())
                past_actions.append(action_taken)
        shuffled_output = shuffle(past_actions, temporal_output).unsqueeze_(1)

        if rnn:
            enc_states = [torch.zeros([1, 1, h], device=device)
                          for h in enc.hidden_sizes]
            encoded, _ = enc(data, enc_states)
        else:
            encoded = enc(data)
            encoded = encoded.unsqueeze_(1)
        dis_states = [torch.zeros([1, 1, h], device=device)
                      for h in ldis.hidden_sizes]
        F_output, _ = ldis(shuffled_output, dis_states.copy())
        T_output, _ = ldis(encoded, dis_states.copy())

        loss = F.binary_cross_entropy(torch.sigmoid(F_output[-1]).sum(), torch.tensor(0., device=device)) + \
            F.binary_cross_entropy(
                torch.sigmoid(T_output[-1]).sum(), torch.tensor(0., device=device))
        dis_optim.zero_grad()
        loss.backward(retain_graph=True)
        dis_optim.step()

        loss = torch.tensor(0., device=device)
        for log_p in log_prob:
            loss -= log_p * torch.sigmoid(F_output[-1]).sum().item()
        eos_optim.zero_grad()
        loss.backward()
        eos_optim.step()
Example #25
 def choose_action(self, state):
     state = torch.unsqueeze(torch.FloatTensor(state), 0)
     probs = self.policy(state)
     c = Categorical(probs)
     action = c.sample()
     return int(action.data.numpy())
Example #26
 def get_action_probabilities(self, observation, temperature=1):
     with torch.no_grad():
         return Categorical(logits=self.forward(observation) * temperature)
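In Example #26 the logits are multiplied by `temperature`, so larger values sharpen the distribution; under the more common convention one divides by the temperature, so that T > 1 flattens it. A small illustration with made-up logits:

import torch
from torch.distributions import Categorical

logits = torch.tensor([2.0, 1.0, 0.1])

print(Categorical(logits=logits * 2.0).probs)   # multiplied (as in Example #26): sharper
print(Categorical(logits=logits / 2.0).probs)   # divided by T=2: flatter
print(Categorical(logits=logits).entropy())     # reference entropy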
Example #27
def active_learning_taylor(func_name,
                           start_rand_idxs=None,
                           bud=None,
                           valid=True,
                           fac_loc_idx=None):

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    torch.manual_seed(42)
    np.random.seed(42)
    model = ResNet18(num_cls)
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    idxs = start_rand_idxs

    criterion = nn.CrossEntropyLoss()
    criterion_nored = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    if func_name == 'Facility Location':
        idxs = run_stochastic_Facloc(x_trn, y_trn, bud)
        facility_loaction_warm_start = copy.deepcopy(idxs)

    remainList = set([i for i in range(N)])
    idxs = list(idxs)
    remainList = remainList.difference(idxs)

    subset_trnloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=trn_batch_size,
        shuffle=False,
        sampler=SubsetRandomSampler(idxs),
        pin_memory=True)

    if func_name == 'Taylor Online':
        print("Starting Online OneStep Run with taylor on loss!")
    elif func_name == 'Full OneStep':
        print("Starting Online OneStep Run without taylor!")
    elif func_name == 'Facloc Regularized':
        print(
            "Starting Facility Location Regularized Online OneStep Run with taylor!"
        )
    elif func_name == 'Random Greedy':
        print("Starting Randomized Greedy Online OneStep Run with taylor!")
    elif func_name == 'Facility Location':
        print("Starting Facility Location!")
    elif func_name == 'Random':
        print("Starting Random Run!")
    elif func_name == 'Random Perturbation':
        print(
            "Starting Online OneStep Run with taylor with random perturbation!"
        )
    elif func_name == "FASS":
        print("Filtered Active Submodular Selection(FASS)!")
    #elif func_name == 'Proximal':
    #print("Starting Online Proximal OneStep Run with taylor!")
    #elif func_name == 'Taylor on Logit':
    #    print("Starting Online OneStep Run with taylor on logit!")

    # if valid:
    #     print("Online OneStep Run with Taylor approximation and with Validation Set",file=logfile)
    # else:
    #     print("Online OneStep Run with Taylor approximation and without Validation Set",file=logfile)

    val_accies = np.zeros(no_select)
    test_accies = np.zeros(no_select)
    unlab_accies = np.zeros(no_select)

    # idxs = start_rand_idxs

    def weight_reset(m):
        torch.manual_seed(42)
        torch.cuda.manual_seed(42)
        np.random.seed(42)
        random.seed(42)
        torch.backends.cudnn.deterministic = True

        if isinstance(m, nn.Linear):
            #m.reset_parameters()
            m.weight.data.normal_(0.0, 0.02)
            m.bias.data.fill_(0)
        elif isinstance(m, nn.Conv2d):
            nn.init.xavier_uniform(m.weight.data)
            if m.bias is not None:
                nn.init.xavier_uniform(m.bias.data)

    fn = nn.Softmax(dim=1)
    for n in range(no_select):

        model.train()
        for i in range(num_epochs):

            accFinal = 0.
            for batch_idx, (inputs, targets) in enumerate(subset_trnloader):
                # targets can have non_blocking=True.
                x, y = inputs.to(device), targets.to(device, non_blocking=True)
                #x, y = Variable(x.cuda()), Variable(y.cuda())
                optimizer.zero_grad()
                out = model(x)
                loss = F.cross_entropy(out, y)
                accFinal += torch.sum(
                    (torch.max(out, 1)[1] == y).float()).data.item()
                loss.backward()

                if (i % 50
                        == 0) and (accFinal < 0.2):  # reset if not converging
                    model = model.apply(weight_reset).cuda()
                    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

                # clamp gradients, just in case
                for p in filter(lambda p: p.grad is not None,
                                model.parameters()):
                    p.grad.data.clamp_(min=-.1, max=.1)

                optimizer.step()

        print(n + 1, 'Time', 'SubsetTrn', loss.item()
              )  #, ,FullTrn,ValLoss: full_trn_loss.item(), val_loss.item())

        curr_X_trn = x_trn[list(remainList)]
        #curr_Y_trn = y_trn[list(remainList)]

        model.eval()
        with torch.no_grad():
            '''full_trn_out = model(x_trn)
            full_trn_loss = criterion(full_trn_out, y_trn).mean()
            sub_trn_out = model(x_trn[idxs])
            sub_trn_loss = criterion(sub_trn_out, y_trn[idxs]).mean()'''

            correct = 0
            total = 0
            for batch_idx, (inputs, targets) in enumerate(valloader):
                inputs, targets = inputs.to(device), targets.to(
                    device, non_blocking=True)
                outputs = model(inputs)
                _, val_predict = outputs.max(1)
                correct += val_predict.eq(targets).sum().item()
                total += targets.size(0)
            val_acc = 100 * correct / total

            correct = 0
            total = 0
            for batch_idx, (inputs, targets) in enumerate(testloader):
                inputs, targets = inputs.to(device), targets.to(
                    device, non_blocking=True)
                outputs = model(inputs)
                _, tst_predict = outputs.max(1)
                correct += tst_predict.eq(targets).sum().item()
                total += targets.size(0)
            tst_acc = 100.0 * correct / total

            remloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=trn_batch_size,
                shuffle=False,
                sampler=SubsetRandomSampler(list(remainList)),
                pin_memory=True)

            correct = 0
            total = 0
            cnt = 0
            predictions = []
            for batch_idx, (inputs, targets) in enumerate(remloader):
                inputs, targets = inputs.to(device), targets.to(
                    device, non_blocking=True)
                outputs = model(inputs)
                predictions.append(outputs)
                _, rem_predict = outputs.max(1)
                if cnt == 0:
                    y_rem_trn = rem_predict
                    cnt = cnt + 1
                else:
                    y_rem_trn = torch.cat([y_rem_trn, rem_predict], dim=0)

                correct += rem_predict.eq(targets).sum().item()
                total += targets.size(0)
            rem_acc = 100 * correct / total

        val_accies[n] = val_acc
        test_accies[n] = tst_acc
        unlab_accies[n] = rem_acc

        #if ((i + 1) % select_every == 0) and func_name not in ['Facility Location','Random']:
        # val_in, val_t = x_val.to(device), y_val.to(device)  # Transfer them to device
        cached_state_dict = copy.deepcopy(model.state_dict())
        clone_dict = copy.deepcopy(model.state_dict())
        # Dont put the logs for Selection on logfile!!
        # print("With Taylor approximation",file=logfile)
        # print("selEpoch: %d, Starting Selection:" % i, str(datetime.datetime.now()),file=logfile)
        #t_ng_start = time.time()

        if func_name == 'Random Greedy':
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, y_rem_trn,
                                                   int(0.9 * no_points),
                                                   clone_dict)
            new_idxs = list(np.array(list(remainList))[new_idxs])

            remainList = remainList.difference(new_idxs)
            new_idxs.extend(
                list(
                    np.random.choice(list(remainList),
                                     size=int(0.1 * no_points),
                                     replace=False)))
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        elif func_name == "FASS":

            cnt = 0
            for pre in predictions:
                soft = fn(pre)
                if cnt == 0:
                    entropy2 = Categorical(probs=soft).entropy()
                    cnt = cnt + 1
                else:
                    entropy2 = torch.cat(
                        [entropy2, Categorical(probs=soft).entropy()], dim=0)

            #print(entropy2.shape)
            if 5 * no_points < entropy2.shape[0]:
                values, indices = entropy2.topk(5 * no_points)
                #indices = list(np.array(list(remainList))[indices.cpu()])
            else:
                indices = [i for i in range(entropy2.shape[0])
                           ]  #list(remainList)

            knn_idxs_flag_val = perform_knnsb_selection(datadir,
                                                        data_name,
                                                        curr_X_trn[indices],
                                                        y_rem_trn[indices],
                                                        fraction,
                                                        selUsing='val')
            #print(knn_idxs_flag_val)
            #print(len(knn_idxs_flag_val))

            ##print(len(knn_idxs_flag_val),len(indices))
            knn_idxs_flag_val = list(
                np.array(list(remainList))[indices.cpu()][knn_idxs_flag_val])

            remainList = remainList.difference(knn_idxs_flag_val)
            idxs.extend(knn_idxs_flag_val)

        elif func_name == 'Random':
            state = np.random.get_state()
            np.random.seed(n * n)
            new_idxs = np.random.choice(list(remainList),
                                        size=no_points,
                                        replace=False)
            np.random.set_state(state)
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

            subset_trnloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=trn_batch_size,
                shuffle=False,
                sampler=SubsetRandomSampler(idxs),
                pin_memory=True)

        elif func_name == 'Facility Location':

            new_idxs = run_stochastic_Facloc(curr_X_trn, rem_predict,
                                             no_points)
            new_idxs = np.array(list(remainList))[new_idxs]

            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

            subset_trnloader = torch.utils.data.DataLoader(
                trainset,
                batch_size=trn_batch_size,
                shuffle=False,
                sampler=SubsetRandomSampler(idxs),
                pin_memory=True)

        else:
            new_idxs = setf_model.naive_greedy_max(curr_X_trn, rem_predict,
                                                   no_points,
                                                   clone_dict)  # , grads_idxs
            new_idxs = np.array(list(remainList))[new_idxs]

            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)
        '''elif func_name == 'Proximal':
            previous = torch.zeros(N,device=device)
            previous[idxs] = 1.0 
            new_idxs = setf_model.naive_greedy_max(bud, clone_dict,None,previous)
            idxs = new_idxs'''

        # print("selEpoch: %d, Selection Ended at:" % (i), str(datetime.datetime.now()),file=logfile)
        # print("Naive greedy total time with taylor:", time.time()-t_ng_start,file=logfile)
        model.load_state_dict(cached_state_dict)

    # Calculate Final SubsetTrn, FullTrn, Val and Test Loss
    # Calculate Val and Test Accuracy

    if func_name == 'Facility Location':
        return val_accies, test_accies, unlab_accies, idxs, facility_loaction_warm_start
    else:
        return val_accies, test_accies, unlab_accies, idxs
Example #28
 def call_rsample():
     return Categorical(p).rsample()
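`Categorical.has_rsample` is `False`, so the call in Example #28 raises `NotImplementedError`; the snippet reads like a test of exactly that behaviour. When a reparameterized (differentiable) sample is actually needed, `RelaxedOneHotCategorical` (Gumbel-Softmax) is the usual substitute. A sketch, with an arbitrary probability vector:

import torch
from torch.distributions import Categorical, RelaxedOneHotCategorical

p = torch.tensor([0.2, 0.3, 0.5], requires_grad=True)

print(Categorical(probs=p).has_rsample)          # False -> .rsample() raises NotImplementedError

relaxed = RelaxedOneHotCategorical(temperature=torch.tensor(0.5), probs=p)
soft_sample = relaxed.rsample()                  # differentiable vector on the simplex
loss = (soft_sample * torch.arange(3.0)).sum()   # any downstream differentiable function
loss.backward()                                  # gradients flow back to p
print(soft_sample, p.grad)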
Example #29
def select_action(policy):
    probs = policy()
    m = Categorical(probs)
    action = m.sample()
    #policy.saved_log_probs.append(m.log_prob(action))
    return m.log_prob(action), action.cpu().tolist()
Example #30
 def forward(self, state):
     output = F.relu(self.linear1(state))
     output = F.relu(self.linear2(output))
     output = self.linear3(output)
     distribution = Categorical(F.softmax(output, dim=-1))
     return distribution
Example #31
def main(args):
    #use_cuda = torch.cuda.is_available()
    use_cuda = False  # Faster on cpu
    device = torch.device("cuda:0" if use_cuda else "cpu")

    # Environment
    if args.task == 'two_step':
        task = Two_step_task(args.p_common_dist, args.r_common_dist,
                             args.p_reversal_dist)
    elif args.task == 'rocket':
        task = Rocket_task(args.p_reversal_dist, args.p_reward_reversal_dist)
    elif args.task == 'rooms_grid':
        task = Rooms_grid_task(args.room_size)
    state_dim = task.state_dim
    action_dim = task.action_dim

    # Model
    if args.model == 'LSTM':
        model = LSTM(state_dim=state_dim,
                     action_dim=action_dim,
                     hidden_dim=args.hidden_dim,
                     device=device)
    if args.load_weights_from is not None:
        model.load_state_dict(torch.load(args.load_weights_from))
    model.to(device)
    model.eval()

    # Construct empty dataframe to record testing results
    if args.task == 'two_step':
        col_names = [
            'Episode', 'Trial', 'T', 'State', 'Action', 'Reward',
            'Rewarded_state'
        ]
    elif args.task == 'rocket':
        col_names = [
            'Episode', 'Trial', 'T', 'State', 'Action', 'Reward',
            'Rewarded_state', 'Transition_regime'
        ]
    elif args.task == 'rooms_grid':
        col_names = [
            'Episode', 'Trial', 'T', 'State_row', 'State_col', 'Action',
            'Reward', 'Reward_location_row', 'Reward_location_col'
        ]
    n_cols = len(col_names)
    df = np.full_like(np.zeros([1, n_cols]), np.nan)
    row = 1  # keep track of what row we're in

    # Testing loop
    with torch.no_grad():
        for episode in range(args.episodes):
            if episode % args.print_every == 0:
                print("Starting episode: ", episode)
            env = task.sample()
            model.reinitialize()
            r, a = 0, None
            for trial in range(args.trials):
                # Reset environment for new trial
                env.init_new_trial()
                s = env.state
                done = False
                # Run a trial
                T = 0
                while not done:
                    if T > args.timeout:
                        print("Model timed out at T = ", args.timeout)
                        break
                    T += 1

                    # Add new row to dataframe
                    df = np.concatenate((df, np.zeros([1, n_cols])), axis=0)

                    # Record some data
                    df[row, 0] = episode  #episode number
                    df[row, 1] = trial  #trial number
                    df[row, 2] = T  #timestep
                    if args.task in ['two_step', 'rocket']:
                        df[row, 3] = np.nonzero(np.array(s))[0][0]  #state
                        df[row,
                           6] = env.rewarded_state  #note: recorded before step
                    elif args.task == 'rooms_grid':
                        df[row, 3] = env.state_loc[0]
                        df[row, 4] = env.state_loc[1]
                        df[row, 7] = env.reward_location[0]
                        df[row, 8] = env.reward_location[1]
                    if args.task == 'rocket':
                        df[row, 7] = env.transition_regime

                    # Convert state, previous action and previous reward to torch.tensors
                    s = torch.tensor(s).type(torch.FloatTensor).to(device)
                    a_prev = torch.zeros(action_dim,
                                         dtype=torch.float).to(device)
                    if a is not None:
                        a_prev[a] = 1
                    r_prev = torch.tensor(r).type(torch.FloatTensor).to(device)

                    # Generate action and value prediction
                    probs, v = model(s, a_prev, r_prev)
                    m = Categorical(probs)
                    a = m.sample()

                    # Take a step in the environment
                    s, r, done = env.step(a)

                    # Record the rest of the row's data
                    if args.task in ['two_step', 'rocket']:
                        df[row, 4] = a.item()  #action
                        df[row, 5] = r  #reward
                    elif args.task == 'rooms_grid':
                        df[row, 5] = a.item()
                        df[row, 6] = r

                    # Update row
                    row += 1

    # Write output file
    df = df[1:, :]  # Remove first row of nans
    print("Writing results to: ", args.out_data_file)
    np.save(args.out_data_file, df)
Example #32
 def get_entropy(self, obs):
     probs = self.policy_net(obs)
     m = Categorical(probs)
     return m.entropy()
Example #33
 def action_dist(self, state):
     state = torch.tensor(state).float().to(DEVICE)
     return Categorical(F.softmax(self.forward(state), -1))
Example #34
    batch_size = 128

    loss_curve = []
    reward_list = []
    for i in trange(300):
        all_reward = 0
        state = env.reset()
        saved_log_probs = []
        rewards = []

        while True:
            state = torch.from_numpy(state).float().unsqueeze(0)

            # choose an action according to the predicted probabilities
            probs = policy(state)
            m = Categorical(probs)
            action = m.sample()
            saved_log_probs.append(m.log_prob(action))

            # interact with the environment
            next_state, reward, done, info = env.step(action.item())
            rewards.append(reward)

            all_reward += reward

            if done:
                reward_list.append(all_reward)
                break
            state = next_state

        policy_loss = []
Example #35
 def forward(self, state):
     state = torch.tensor(state).float().to(DEVICE)
     state = self.conv(state)
     state = state.view(state.size(0), -1)
     action_logit = self.fc(state)
     return Categorical(F.softmax(action_logit, -1))
Example #36
    def forward(self, jobs, machines, allocable_jobs=None, allocable_machines=None, argmax=False):
        job_input_size = jobs.size(0)
        machine_input_size = machines.size(0)

        E_j, E_m = self.get_embedding(jobs, machines)

        g_1 = self.last_j
        j_logits = self.j_att(g_1, E_j)

        ### select a job: mask out jobs that are not allocable
        if allocable_jobs is not None:
            x = []
            for _ in range(job_input_size):
                if _ not in allocable_jobs:
                    x.append(_)
            if len(x) > 0:
                mask = torch.from_numpy(np.array(x, dtype=int))
                #print("MASK!", mask)
                j_logits[mask] = -1e8

        job_softmax = torch.softmax(j_logits, 0)
        job_sampler = Categorical(job_softmax)


        if argmax:
            selected_job = torch.argmax(job_softmax)
        else:
            try:
                selected_job = job_sampler.sample()
            except Exception:
                # Dump the intermediate tensors for debugging, then re-raise.
                print("ERROR at job_logits!!")
                print("g_1")
                print(g_1)
                print("E_J")
                print(E_j)
                print("job_logits")
                print(j_logits)
                print("job_softmax")
                print(job_softmax)
                raise


        as_i = int(selected_job.detach().numpy())

        e_js = E_j[selected_job]
        m_logits = self.m_att(e_js, E_m)
        self.last_j = e_js.detach()
        ### select a machine: mask out machines that are not allocable for this job
        if allocable_machines is not None:
            x = []
            for _ in range(machine_input_size):
                if _ not in allocable_machines[as_i]:
                    x.append(_)
            mask = torch.from_numpy(np.array(x, dtype=int))
            m_logits[mask] = -1e8

        machine_softmax = torch.softmax(m_logits, 0)
        #machine_softmax[machine_softmax < 0] = 0
        machine_sampler = Categorical(machine_softmax)
        if argmax:
            selected_machine = torch.argmax(machine_softmax, -1)
        else:
            selected_machine = machine_sampler.sample()


        logpas = machine_sampler.log_prob(selected_machine) + job_sampler.log_prob(selected_job)
        return (int(selected_job.detach().numpy()), int(selected_machine.detach().numpy())), logpas
Beispiel #37
0
 def forward(self, state):
     state = torch.tensor(state).float().to(DEVICE)
     out = self.common(state)
     action_logit, value = self.action_head(out), self.value_head(out)
     return Categorical(F.softmax(action_logit, -1)), value
Beispiel #38
0
class Multinomial(Distribution):
    r"""
    Creates a Multinomial distribution parameterized by `total_count` and
    either `probs` or `logits` (but not both). The innermost dimension of
    `probs` indexes over categories. All other dimensions index over batches.

    Note that `total_count` need not be specified if only :meth:`log_prob` is
    called (see example below).

    .. note:: :attr:`probs` will be normalized to sum to 1.

    -   :meth:`sample` requires a single shared `total_count` for all
        parameters and samples.
    -   :meth:`log_prob` allows different `total_count` for each parameter and
        sample.

    Example::

        >>> m = Multinomial(100, torch.tensor([ 1., 1., 1., 1.]))
        >>> x = m.sample()  # equal probability of 0, 1, 2, 3
        tensor([ 21.,  24.,  30.,  25.])

        >>> Multinomial(probs=torch.tensor([1., 1., 1., 1.])).log_prob(x)
        tensor([-4.1338])

    Args:
        total_count (int): number of trials
        probs (Tensor): event probabilities
        logits (Tensor): event log probabilities
    """
    arg_constraints = {'logits': constraints.real}  # Let logits be the canonical parameterization.

    @property
    def mean(self):
        return self.probs * self.total_count

    @property
    def variance(self):
        return self.total_count * self.probs * (1 - self.probs)

    def __init__(self, total_count=1, probs=None, logits=None, validate_args=None):
        if not isinstance(total_count, Number):
            raise NotImplementedError('inhomogeneous total_count is not supported')
        self.total_count = total_count
        self._categorical = Categorical(probs=probs, logits=logits)
        batch_shape = self._categorical.batch_shape
        event_shape = self._categorical.param_shape[-1:]
        super(Multinomial, self).__init__(batch_shape, event_shape, validate_args=validate_args)

    def _new(self, *args, **kwargs):
        return self._categorical._new(*args, **kwargs)

    @constraints.dependent_property
    def support(self):
        return constraints.integer_interval(0, self.total_count)

    @property
    def logits(self):
        return self._categorical.logits

    @property
    def probs(self):
        return self._categorical.probs

    @property
    def param_shape(self):
        return self._categorical.param_shape

    def sample(self, sample_shape=torch.Size()):
        sample_shape = torch.Size(sample_shape)
        samples = self._categorical.sample(torch.Size((self.total_count,)) + sample_shape)
        # samples.shape is (total_count, sample_shape, batch_shape), need to change it to
        # (sample_shape, batch_shape, total_count)
        shifted_idx = list(range(samples.dim()))
        shifted_idx.append(shifted_idx.pop(0))
        samples = samples.permute(*shifted_idx)
        counts = samples.new(self._extended_shape(sample_shape)).zero_()
        counts.scatter_add_(-1, samples, torch.ones_like(samples))
        return counts.type_as(self.probs)

    def log_prob(self, value):
        if self._validate_args:
            self._validate_sample(value)
        logits, value = broadcast_all(self.logits.clone(), value)
        log_factorial_n = torch.lgamma(value.sum(-1) + 1)
        log_factorial_xs = torch.lgamma(value + 1).sum(-1)
        logits[(value == 0) & (logits == -float('inf'))] = 0
        log_powers = (logits * value).sum(-1)
        return log_factorial_n - log_factorial_xs + log_powers
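A brief usage sketch of the distribution above (written against torch.distributions.Multinomial, which this class appears to be an older version of); the probabilities and counts below are illustrative only.

import torch
from torch.distributions import Multinomial

# One shared total_count is required for sampling.
m = Multinomial(total_count=10, probs=torch.tensor([0.1, 0.2, 0.3, 0.4]))
counts = m.sample()        # a count vector over 4 categories, summing to 10
print(counts)
print(m.log_prob(counts))  # log-probability of that count vector

# As the docstring notes, probs are normalized internally, so
# Multinomial(10, torch.tensor([1., 2., 3., 4.])) defines the same distribution.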
Beispiel #39
0
def main(args):
    # Build data loader
    if not os.path.isdir(args.model_path):
        os.makedirs(args.model_path)

    data_loader,ds_class = get_loader(args.data_dir,args.batch_size,
                             shuffle=True, num_workers=args.num_workers, ds = args.ds) 

    # Build eval data loader
    if hasattr(ds_class, 'lbl2id'):
        eval_data_loader,_ = get_loader(args.data_dir_test, args.batch_size,
                             shuffle=True, num_workers=args.num_workers, ds = args.ds, lbl2id = ds_class.lbl2id) 
    else:
        eval_data_loader,_ = get_loader(args.data_dir_test, args.batch_size,
                             shuffle=True, num_workers=args.num_workers, ds = args.ds)

    # Loss and Optimizer
    model_base = SkeletonAction(args.input_size, args.hidden_size, args.num_class, args.num_action, args.num_layers, dropout = args.dropout)
    model_value = ValueNetwork( args.hidden_size )
    model_policy = PolicyNetwork( args.hidden_size, args.num_action )
    model_c = CoreClassification( args.hidden_size, args.num_class )
    criterion = nn.CrossEntropyLoss()
    criterion_value = nn.SmoothL1Loss()

    if torch.cuda.is_available():
        model_base.cuda()
        model_value.cuda()
        model_policy.cuda()
        model_c.cuda()
        criterion = criterion.cuda()
        criterion_value = criterion_value.cuda()

    params = list(model_base.parameters()) + list(model_c.parameters()) + list(model_value.parameters()) \
             + list(model_policy.parameters())
    #opt = torch.optim.Adam(params, lr=args.learning_rate, weight_decay = args.weight_decay)
    opt = torch.optim.Adam(params, lr=args.learning_rate)
    #opt_value = torch.optim.Adam(model_value.parameters(), lr = args.learning_rate)
    #opt_policy = torch.optim.Adam(model_policy.parameters(), lr = args.learning_rate)
    #opt_c = torch.optim.Adam(model_c.parameters(), lr = args.learning_rate)

    # Load the trained model parameters
    # Now, we try to find the latest encoder and decoder model.
    if os.path.isdir(args.model_path) and os.listdir(args.model_path):
        m_fn = max(glob.glob(os.path.join(args.model_path, 'model*')), key = os.path.getctime)
        if m_fn:
            logging.info("Loading model from %s", m_fn)
            model_base.load_state_dict(torch.load(m_fn))

    # Train the Models
    total_step = len(data_loader)
    # Initialize some variables.
    h_tensor = torch.zeros(args.batch_size, args.hidden_size)
    if torch.cuda.is_available():
        h_tensor = h_tensor.cuda()
    for epoch in range(args.num_epochs):
        total_train = 0
        total_correct = 0
        total_train_2 = 0
        total_correct_2 = 0
        for i_step, (lbl, data, length) in enumerate(data_loader):
            # Set mini-batch dataset
            lbl = Variable(lbl.squeeze())
            data = Variable(data)
            mask = torch.zeros(data.size(0), data.size(1))
            for i,m in zip(length, mask):
                m[0:i[0]] = 1
            mask = Variable(mask)
            if torch.cuda.is_available():
                lbl = lbl.cuda()
                data = data.cuda()
                mask = mask.cuda()

            h_tensor.resize_(data.size(0), data.size(1)) 
            init_h = Variable(h_tensor)
            init_hs = [ init_h for i in range( args.num_layers ) ]
            init_cs = init_hs

            zero = torch.zeros(data.size(0),)
            zero = Variable(zero)
            if torch.cuda.is_available():
                zero = zero.cuda()
 
            hs = []
            action_probs = []
            actions = []
            ht, ct = model_base( data[:,0,:], zero, init_hs, init_cs)
            hs.append(ht[-1])

            action_prob = model_policy(ht[-1])
            action_probs.append(action_prob)
            action = Categorical(action_prob)
            action = action.sample()
            actions.append(action)
            
            for j_step in range(1, data.shape[1]):
                ht, ct = model_base( data[:,j_step,:], actions[j_step-1].float(), ht, ct)
                hs.append(ht[-1])
                action_prob = model_policy(ht[-1])
                # We need to smooth the probability
                action = Categorical((action_prob + action_probs[j_step-1]) / 2)
                action = action.sample()
                actions.append(action)
                action_probs.append(action_prob)
            # All actions for the sequence have been generated; now backprop.
            # The reward only arrives at the end of the episode.
            hs_t = torch.stack(hs, dim = 1)
            hs_t = (hs_t * mask.unsqueeze(2) ).sum(dim = 1) / mask.sum(dim = 1).unsqueeze(1)
            logits = model_c(hs_t) 
            #log_p = F.log_softmax(logits, dim = 1)
            loss_ent = criterion(logits, lbl)
            #loss = - (mask.squeeze() * log_p[long_idx, lbl.squeeze().data]).sum() / mask.sum()

            pred_lbl = logits.max(dim = 1)[1]
            reward = Variable((pred_lbl.data == lbl.data).float())
            reward = reward.view(data.size(0), 1)
            reward = reward.repeat(1, data.size(1))
            loss_value = []
            loss_policy = []

            actions = torch.stack(actions, dim = 1)
            action_probs = torch.stack(action_probs, dim = 1)
            hs = torch.stack(hs, dim = 1)
            hs = hs.view(-1, hs.size(-1))
            exp_reward = model_value(hs)
            exp_reward = exp_reward.view(data.size(0), data.size(1))
            loss_value =( exp_reward - reward ) ** 2
            loss_value = (loss_value * mask).sum() / mask.sum()
            advantage = reward - Variable(exp_reward.data)
            idx = torch.LongTensor(range(data.size(0)))
            idx = idx.view(data.size(0), 1)
            idx = idx.repeat(1, data.size(1))
            idx = idx.view(data.size(0) * data.size(1))
            if torch.cuda.is_available():
                idx = idx.cuda()
            action_probs = action_probs.view(action_probs.size(0) * action_probs.size(1),action_probs.size(-1))
            actions = actions.view(actions.size(0) * actions.size(1))
            log_prob = action_probs[idx, actions]
            log_prob = log_prob.view(mask.size(0), mask.size(1))
            loss_policy = -torch.log(log_prob + 1e-7) * mask * advantage
            loss_policy = loss_policy.sum() / mask.sum()
            loss = loss_ent + loss_policy + loss_value
            
            # Now we update the value network
            #for j_step, (h, action, action_prob) in enumerate(zip(hs, actions, action_probs)):
            #    # total reward.
            #    target = reward * discount ** (data.size(0) - j_step)
            #    exp_reward = model_value(h)
            #    logging.info('exp_reward: %.4f, target: %.4f', exp_reward.mean().data[0], target.mean().data[0])
            #    l_value = criterion_value(exp_reward, target)
            #    loss_value.append( l_value )
            #    advantage = target - exp_reward
            #    c = Categorical(action_prob)
            #    l_policy = -c.log_prob(action) * advantage
            #    loss_policy.append( l_policy.mean() )
            #loss_value = torch.stack(loss_value).mean()
            #loss_policy = torch.stack(loss_policy).mean()
            #loss += loss_value + loss_policy
           

            opt.zero_grad()
            loss.backward()
            old_norm = clip_grad_norm(params, args.grad_clip)
            opt.step()
            total_train += data.size(0)
            total_correct += (pred_lbl.data.cpu().squeeze() == lbl.data.cpu().squeeze()).sum()
            # Use grad clip.
            # Eval the trained model
            #logging.info('Epoch [%d/%d], Loss: %.4f, reward: %5.4f, loss_value: %5.4f, loss_policy: %5.4f', 
            #                        epoch, args.num_epochs, 
            #                        loss_ent.data[0], reward.mean().data[0], loss_value.data[0], loss_policy.data[0])
            if i_step % args.log_step == 0:
                accuracy = total_correct * 1.0 / total_train
                logging.info('Epoch [%d/%d], Loss: %.4f, reward: %5.4f, loss_value: %5.4f, loss_policy: %5.4f, accuracy: %5.4f', 
                                    epoch, args.num_epochs, 
                                    loss_ent.data[0], reward.mean().data[0], loss_value.data[0], loss_policy.data[0], accuracy)
                #logging.info('Epoch [%d/%d], Loss: %.4f, accuracy: %5.4f, reward: %5.4f'
                #                  ,epoch, args.num_epochs, 
                #                    loss_ent.data[0], accuracy, reward.mean().data[0])

            if i_step % args.eval_step == 0:
                model_base.eval()
                model_c.eval()
                model_policy.eval()
                total_num = 0
                correct_num = 0
                for k_step, (lbl, data, length) in enumerate(eval_data_loader):
                    lbl = Variable(lbl.squeeze())
                    data = Variable(data)
                    mask = torch.zeros(data.size(0), data.size(1))
                    for i,m in zip(length, mask):
                        m[0:i[0]] = 1
                    if torch.cuda.is_available():
                        lbl = lbl.cuda()
                        data = data.cuda()
                        mask = mask.cuda()
        
                    mask = Variable(mask)

                    h_tensor.resize_(data.size(0), data.size(1)) 
                    init_h = Variable(h_tensor)
                    init_hs = [ init_h for i in range( args.num_layers ) ]
                    init_cs = init_hs


                    zero = torch.zeros(data.size(0),)
                    zero = Variable(zero)
                    if torch.cuda.is_available():
                        zero = zero.cuda()
 
                    hs = []
                    action_probs = []
                    actions = []
                    ht, ct = model_base( data[:,0,:], zero, init_hs, init_cs)
                    hs.append(ht[-1])

                    action_prob = model_policy(ht[-1])
                    action_probs.append(action_prob)
                    action = Categorical(action_prob)
                    action = action.sample()
                    actions.append(action)
                    
                    for j_step in range(1, data.shape[1]):
                        ht, ct = model_base( data[:,j_step,:], action.float(), ht, ct)
                        hs.append(ht[-1])
                        action_prob = model_policy(ht[-1])
                        action = Categorical(action_prob)
                        action = action.sample()
                        actions.append(action)
                    # All actions for the sequence have been generated;
                    # compute the pooled hidden states and classification logits.
                    hs_t = torch.stack(hs, dim = 1)
                    hs_t = (hs_t * mask.unsqueeze(2) ).sum(dim = 1) / mask.sum(dim = 1).unsqueeze(1)
                    logits = model_c(hs_t) 
                    log_p = F.log_softmax(logits, dim = 1)

                    pred_lbl = logits.max(dim = -1)[1].data.cpu()
                    total_num += data.size(0)
                    correct_num += (pred_lbl.squeeze() == lbl.data.cpu().squeeze()).sum()
                    loss = criterion(logits, lbl)
                accuracy = correct_num * 1.0 / total_num
                logging.info('Validating [%d], Loss: %.4f, accuracy: %.4f' ,epoch,
                                loss.data[0], accuracy)
         
                model_base.train()
                model_c.train()
                model_policy.train()
                

        accuracy = total_correct * 1.0 / total_train
        logging.info('Epoch [%d/%d], Loss: %.4f, accuracy: %5.4f, reward: %5.4f'
                          ,epoch, args.num_epochs, 
                            loss_ent.data[0], accuracy, reward.mean().data[0])
Beispiel #40
0
class GMMExperiments(object):
    def __init__(self, n_obs, mu0, sigma0, n_clusters, hidden_dim = 30):

        # dimension of the problem
        self.dim = len(mu0)
        self.n_clusters = n_clusters
        self.n_obs = n_obs

        # prior parameters
        self.mu0 = mu0
        self.sigma0 = torch.Tensor([sigma0])
        # uniform prior on weights
        self.prior_weights = torch.ones(self.n_clusters) / self.n_clusters

        # true parameters
        self.set_true_params()
        self.cat_rv = Categorical(probs = self.prior_weights)

        # the encoder
        # self.gmm_encoder = GMMEncoder(data_dim = self.dim,
        #                      n_classes = self.n_clusters,
        #                      hidden_dim = hidden_dim)
        #
        # self.var_params = {'encoder_params': self.gmm_encoder.parameters()}


        # other variational paramters: we use point masses for
        # the means and variances
        self.set_random_var_params()

        # draw data
        self.n_obs = n_obs
        self.y, self.z = self.draw_data(n_obs = n_obs)

    def set_var_params(self, init_mu, init_log_sigma):
        self.var_params['centroids'] = init_mu
        self.var_params['log_sigma'] = init_log_sigma

    def set_random_var_params(self):
        init_mu = torch.randn((self.n_clusters, self.dim)) * self.sigma0 + self.mu0
        init_mu.requires_grad_(True)

        init_log_sigma = torch.log(torch.Tensor([self.true_sigma]))# torch.log(torch.rand(1))
        init_log_sigma.requires_grad_(True)

        self.init_free_class_weights = torch.rand((self.n_obs, self.n_clusters))
        init_free_class_weights = deepcopy(self.init_free_class_weights)
        init_free_class_weights = init_free_class_weights.requires_grad_(True)
        self.var_params = {'free_class_weights': init_free_class_weights}

        self.set_var_params(init_mu, init_log_sigma)

    def set_kmeans_init_var_params(self, n_kmeans_init = 10):

        for i in range(n_kmeans_init):
            km = KMeans(n_clusters = self.n_clusters).fit(self.y)
            inertia = km.inertia_
            if (i == 0):
                inertia_best = inertia
                km_best = deepcopy(km)
            elif (inertia < inertia_best):
                inertia_best = inertia
                km_best = deepcopy(km)

        init_free_class_weights = torch.zeros((self.n_obs, self.n_clusters))
        for n in range(len(km_best.labels_)):
            init_free_class_weights[n, km_best.labels_[n]] = 3.0

        self.init_free_class_weights = deepcopy(init_free_class_weights)

        init_free_class_weights.requires_grad_(True)
        self.var_params['free_class_weights'] = init_free_class_weights
        # init_centroids = torch.Tensor(km_best.cluster_centers_)
        # init_centroids.requires_grad_(True)
        # self.var_params['centroids'] = init_centroids

    def set_true_params(self):
        # draw means from the prior
        # each row is a cluster mean
        self.true_mus = torch.randn((self.n_clusters, self.dim)) * self.sigma0 + self.mu0

        # just set a data variance
        self.true_sigma = 1.0

    def draw_data(self, n_obs = 1):

        y = torch.zeros((n_obs, self.dim))
        z = torch.zeros(n_obs)
        for i in range(n_obs):
            # class belonging
            z_sample = self.cat_rv.sample()
            z[i] = z_sample

            # observed data
            y[i, :] = self.true_mus[z_sample, :] + torch.randn(self.dim) * self.true_sigma

        # some indices we cache and use later
        self.seq_tensor = torch.LongTensor([i for i in range(n_obs)])

        return y, z

    def get_log_q(self):
        # self.log_class_weights = self.gmm_encoder.forward(self.y)

        fudge_lower_bdd = torch.Tensor([-8])
        self.log_class_weights = log_softmax(torch.max(self.var_params['free_class_weights'], fudge_lower_bdd)) #

        return self.log_class_weights

    def _get_centroid_mask(self, z):
        mask = torch.zeros((self.n_obs, self.n_clusters))
        mask[self.seq_tensor, z] = 1

        return mask.detach()

    def f_z(self, z):
        centroids = self.var_params['centroids'] #
        log_sigma = torch.log(torch.Tensor([self.true_sigma]))  #

        # print('centroids', centroids)
        # print('logsigma', log_sigma)
        # print('log_class_weights', self.log_class_weights)

        centroid_mask = self._get_centroid_mask(z)
        centroids_masked = torch.matmul(centroid_mask, centroids)

        loglik_z = get_normal_loglik(self.y, centroids_masked, log_sigma).sum(dim = 1)

        mu_prior_term = get_normal_loglik(centroids, self.mu0, torch.log(self.sigma0)).mean()

        z_prior_term = 0.0 # torch.log(self.prior_weights[z])

        z_entropy_term = (- torch.exp(self.log_class_weights) * self.log_class_weights).mean()

        # print('z_ent_term', z_entropy_term)
        # print('mu_prior_term', mu_prior_term)
        # print('loglik', loglik_z)

        return - (loglik_z + mu_prior_term + z_prior_term + z_entropy_term)

    def get_pm_loss(self, alpha, topk, use_baseline):
        log_q = self.get_log_q()
        pm_loss = pm_lib.get_partial_marginal_loss(self.f_z, log_q, alpha, topk,
                                    use_baseline = use_baseline,
                                    use_term_one_baseline = True)

        return pm_loss

    def get_full_loss(self):
        log_q = self.get_log_q()
        class_weights = torch.exp(log_q)
        return pm_lib.get_full_loss(self.f_z, class_weights)
Beispiel #41
0
    def hightrain(self):
        buffer, buffer_capacity, batch_size = self.highmemory.show()
        s = torch.tensor(buffer['s'], dtype=torch.double).to(self.device)
        pre_option = torch.tensor(buffer['pre_option'],
                                  dtype=torch.double).view(-1,
                                                           1).to(self.device)

        s_ = torch.tensor(buffer['s_'], dtype=torch.double).to(self.device)
        option = torch.tensor(buffer['option'],
                              dtype=torch.double).view(-1, 1).to(self.device)

        option_logp = torch.tensor(buffer['option_logp'],
                                   dtype=torch.double).view(-1,
                                                            1).to(self.device)
        r = torch.tensor(buffer['r'],
                         dtype=torch.double).view(-1, 1).to(self.device)
        done = torch.tensor(buffer['done'],
                            dtype=torch.double).view(-1, 1).to(self.device)
        action_loss_record, value_loss_record, entropy_record, loop_record = 0, 0, 0, 0

        with torch.no_grad():
            value_next = self.highnet(s_)['value']
            option_change_next = torch.where(option > 5,
                                             torch.zeros_like(option), option)
            value_next_zeros = torch.gather(value_next, 1,
                                            option_change_next.long())
            value_next = torch.where(
                option > 5,
                value_next.sum(dim=1, keepdim=True) /
                self.config.get('num_options'), value_next_zeros)

            value_now = self.highnet(s)['value']
            option_change_now = torch.where(pre_option > 5,
                                            torch.zeros_like(pre_option),
                                            pre_option)
            value_now_zeros = torch.gather(value_now, 1,
                                           option_change_now.long())
            value_now = torch.where(
                pre_option > 5,
                value_now.sum(dim=1, keepdim=True) /
                self.config.get('num_options'), value_now_zeros)

            delta = r + (
                1 - done) * self.config.get('gamma') * value_next - value_now
            adv = torch.zeros_like(delta)
            adv[-1] = delta[-1]
            # GAE
            for i in reversed(range(buffer_capacity - 1)):
                adv[i] = delta[i] + self.config.get('tau') * (
                    1 - done[i]) * adv[i + 1]

            target_v = value_now + adv
            # Normalize the advantage estimates
            adv = (adv - adv.mean()) / (adv.std() + np.finfo(np.float64).eps)

        for _ in range(self.config.get('ppoepoch')):
            for index in BatchSampler(
                    SubsetRandomSampler(range(buffer_capacity)), batch_size,
                    False):
                q_short, beta_short = self.highnet(
                    s[index])['q'], self.highnet(s[index])['beta']
                pre_option_short = pre_option[index]
                pi_hat_option = self.sample_option_multi(
                    q_short, beta_short, pre_option_short)
                pi_hat_p = torch.gather(pi_hat_option, 1, option[index].long())
                ratio = pi_hat_p / torch.exp(option_logp[index])
                surr1 = ratio * adv[index]
                surr2 = torch.clamp(
                    ratio, 1.0 - self.config.get('clip_param'),
                    1.0 + self.config.get('clip_param')) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                m = Categorical(pi_hat_option)
                entropy = m.entropy()

                value_now = self.highnet(s[index])['value']
                option_change_now = torch.where(
                    pre_option[index] > 5, torch.zeros_like(pre_option[index]),
                    pre_option[index])
                value_now_zeros = torch.gather(value_now, 1,
                                               option_change_now.long())
                value_now = torch.where(
                    pre_option[index] > 5,
                    value_now.sum(dim=1, keepdim=True) /
                    self.config.get('num_options'), value_now_zeros)

                value_loss = F.smooth_l1_loss(value_now, target_v[index])
                self.highoptimizition.zero_grad()
                loss = action_loss + value_loss - self.config.get(
                    'entropy_para_high') * entropy.mean()
                loss.backward()
                nn.utils.clip_grad_norm_(self.highnet.parameters(),
                                         self.config.get('max_grad_norm'))
                self.highoptimizition.step()
                action_loss_record += action_loss.cpu().detach()
                value_loss_record += value_loss.cpu().detach()
                entropy_record += entropy.mean().cpu().detach()
                loop_record += 1

        return {
            'actionloss': action_loss_record / loop_record,
            'valueloss': value_loss_record / loop_record,
            'entropy': entropy_record / loop_record,
        }
Beispiel #42
0
 def forward(self, state):
     state = torch.tensor(state).float().to(DEVICE)
     out = self.conv(state).view(state.size(0), -1)
     return Categorical(F.softmax(self.policy(out), -1)), self.value(out)
Beispiel #43
0
 def forward(self, x):
     value = self.critic(x)
     probs = self.actor(x)
     dist = Categorical(probs)
     return dist, value