Code Example #1
# Save a minimal checkpoint containing a single entry, then load it back
# and read the value to confirm the save/load round trip works.
checkpoint_test = {'value_test': value_test}
torch.save(
    checkpoint_test,
    r'C:\Users\Lab PC\BCVTB\examples\ePlus85-schedule\shared_model_test_test.pth'
)
checkpoint_test = torch.load(
    r'C:\Users\Lab PC\BCVTB\examples\ePlus85-schedule\shared_model_test_test.pth'
)
value_test_test = checkpoint_test['value_test']
#return policy_loss + 0.5 * value_loss
################################
# 'before' is assumed to hold parameter copies captured earlier (e.g.
# before = [p.data.clone() for p in model.parameters()]); the loop reports
# which parameters changed after the update.
after = list(model.parameters())
for i in range(len(before)):
    print(torch.equal(before[i].data, after[i].data))

# Inspect each trainable parameter: current values, gradient and leaf status.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data, param.grad, param.is_leaf)

#states[0].unsqueeze(0)
# Print the gradient of every parameter that has received one.
for p in model.parameters():
    if p.grad is not None:
        print(p.grad.data)
#x = torch.tensor([[1., -1.], [1., 1.]], requires_grad=True)
#out = x.pow(2).sum()
#out.backward()
#x.grad

#########################################
# Re-create the model and a shared optimizer (SharedAdam is assumed to come
# from a custom my_optim module, as in A3C-style shared-memory setups).
model = ActorCritic(178, params.output_space)
optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)
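A minimal, self-contained sketch of the before/after parameter check used above, with a stand-in nn.Linear model and SGD in place of the ActorCritic/SharedAdam pair (the model, optimizer and loss here are placeholders, not the original setup):

import torch
import torch.nn as nn

# Toy model and optimizer standing in for ActorCritic / SharedAdam.
model = nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Snapshot parameter values before the update.
before = [p.data.clone() for p in model.parameters()]

# One dummy update step with a placeholder loss.
x = torch.randn(8, 4)
loss = model(x).pow(2).mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()

# Each comparison prints False for a parameter the step actually changed.
after = list(model.parameters())
for b, a in zip(before, after):
    print(torch.equal(b, a.data))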
Code Example #2
class BehavioralEmbeddedAgent(Agent):
    def __init__(self, load_dataset=True):

        super(BehavioralEmbeddedAgent, self).__init__()

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta,
                                                     self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta,
                                                    self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta,
                                                    self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                           train=True)
            self.val_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                         train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                          train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(
                self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset,
                batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset,
                batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)

        self.loss_v_beta = torch.nn.KLDivLoss()
        self.loss_q_beta = torch.nn.KLDivLoss()

        self.loss_v_pi = torch.nn.KLDivLoss()
        self.loss_q_pi = torch.nn.KLDivLoss()

        self.histogram = torch.from_numpy(self.meta['histogram']).float()

        w_f, w_v, w_h = calc_hist_weights(self.histogram)

        w_f = torch.clamp(w_f, 0, 10).cuda()
        w_v = torch.clamp(w_v, 0, 10).cuda()
        w_h = torch.clamp(w_h, 0, 10).cuda()
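
        # The clamped histogram weights are passed as per-class weights to the
        # beta cross-entropy losses below, presumably to counteract the action
        # imbalance recorded in meta['histogram'].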

        self.loss_beta_f = torch.nn.CrossEntropyLoss(reduction='mean',
                                                     weight=w_f)
        self.loss_beta_v = torch.nn.CrossEntropyLoss(reduction='mean',
                                                     weight=w_v)
        self.loss_beta_h = torch.nn.CrossEntropyLoss(reduction='mean',
                                                     weight=w_h)

        self.loss_pi_f = torch.nn.CrossEntropyLoss(reduction='sum')
        self.loss_pi_v = torch.nn.CrossEntropyLoss(reduction='sum')
        self.loss_pi_h = torch.nn.CrossEntropyLoss(reduction='sum')

        self.behavioral_model = BehavioralDistEmbedding()
        self.behavioral_model.cuda()

        # actor critic setting

        self.actor_critic_model = ActorCritic()
        self.actor_critic_model.cuda()

        self.actor_critic_target = ActorCritic()
        self.actor_critic_target.cuda()
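
        # actor_critic_target serves as a frozen target copy of actor_critic_model;
        # update_target() below refreshes it from the live model's state_dict.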

        # configure learning

        cnn_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "cnn" in p[0]
        ]
        emb_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "emb" in p[0]
        ]

        v_beta_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_v" in p[0]
        ]
        a_beta_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_adv" in p[0]
        ]

        beta_f_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_f" in p[0]
        ]
        beta_v_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_v" in p[0]
        ]
        beta_h_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_h" in p[0]
        ]

        v_pi_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "critic_v" in p[0]
        ]
        a_pi_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "critic_adv" in p[0]
        ]

        pi_f_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_f" in p[0]
        ]
        pi_v_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_v" in p[0]
        ]
        pi_h_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_h" in p[0]
        ]

        # It is important to move the models to CUDA (or wrap them in DataParallel)
        # before constructing the optimizers. Each loss term below gets its own
        # optimizer (via set_optimizer) over the shared trunk (cnn/emb) plus its
        # head, paired with an ExponentialLR scheduler.

        self.optimizer_critic_v = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params, 0.0008)
        self.scheduler_critic_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_v, self.decay)

        self.optimizer_critic_q = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params + a_pi_params, 0.0008)
        self.scheduler_critic_q = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_q, self.decay)

        self.optimizer_v_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params, 0.0008)
        self.scheduler_v_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_v_beta, self.decay)

        self.optimizer_q_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params + a_beta_params, 0.0008)
        self.scheduler_q_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_q_beta, self.decay)

        self.optimizer_beta_f = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_f_params, 0.0008)
        self.scheduler_beta_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_f, self.decay)

        self.optimizer_beta_v = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_v_params, 0.0008)
        self.scheduler_beta_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_v, self.decay)

        self.optimizer_beta_h = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_h_params, 0.0008)
        self.scheduler_beta_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_h, self.decay)

        self.optimizer_pi_f = BehavioralEmbeddedAgent.set_optimizer(
            pi_f_params, 0.0008)
        self.scheduler_pi_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_f, self.decay)

        self.optimizer_pi_v = BehavioralEmbeddedAgent.set_optimizer(
            pi_v_params, 0.0008)
        self.scheduler_pi_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_v, self.decay)

        self.optimizer_pi_h = BehavioralEmbeddedAgent.set_optimizer(
            pi_h_params, 0.0008)
        self.scheduler_pi_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_h, self.decay)

        actions = torch.LongTensor(consts.hotvec_matrix).cuda()
        self.actions_matrix = actions.unsqueeze(0)

        self.q_bins = consts.q_bins[args.game][:-1] / self.meta['avg_score']
        # the long bins are already normalized
        self.v_bins = consts.v_bins[args.game][:-1] / self.meta['avg_score']

        self.q_bins_torch = Variable(torch.from_numpy(
            consts.q_bins[args.game] / self.meta['avg_score']),
                                     requires_grad=False).cuda()
        self.v_bins_torch = Variable(torch.from_numpy(
            consts.v_bins[args.game] / self.meta['avg_score']),
                                     requires_grad=False).cuda()

        self.batch_range = np.arange(self.batch)

        self.zero = Variable(torch.zeros(1))

    def flip_grad(self, parameters):
        for p in parameters:
            p.requires_grad = not p.requires_grad

    @staticmethod
    def individual_loss_fn_l2(argument):
        return abs(argument.data.cpu().numpy())**2

    @staticmethod
    def individual_loss_fn_l1(argument):
        return abs(argument.data.cpu().numpy())

    def save_checkpoint(self, path, aux=None):

        state = {
            'behavioral_model': self.behavioral_model.state_dict(),
            'actor_critic_model': self.actor_critic_model.state_dict(),
            'optimizer_critic_v': self.optimizer_critic_v.state_dict(),
            'optimizer_critic_q': self.optimizer_critic_q.state_dict(),
            'optimizer_v_beta': self.optimizer_v_beta.state_dict(),
            'optimizer_q_beta': self.optimizer_q_beta.state_dict(),
            'optimizer_beta_f': self.optimizer_beta_f.state_dict(),
            'optimizer_beta_v': self.optimizer_beta_v.state_dict(),
            'optimizer_beta_h': self.optimizer_beta_h.state_dict(),
            'optimizer_pi_f': self.optimizer_pi_f.state_dict(),
            'optimizer_pi_v': self.optimizer_pi_v.state_dict(),
            'optimizer_pi_h': self.optimizer_pi_h.state_dict(),
            'aux': aux
        }

        torch.save(state, path)

    def load_checkpoint(self, path):

        state = torch.load(path)
        self.behavioral_model.load_state_dict(state['behavioral_model'])
        self.actor_critic_model.load_state_dict(state['actor_critic_model'])
        self.optimizer_critic_v.load_state_dict(state['optimizer_critic_v'])
        self.optimizer_critic_q.load_state_dict(state['optimizer_critic_q'])
        self.optimizer_v_beta.load_state_dict(state['optimizer_v_beta'])
        self.optimizer_q_beta.load_state_dict(state['optimizer_q_beta'])
        self.optimizer_beta_f.load_state_dict(state['optimizer_beta_f'])
        self.optimizer_beta_v.load_state_dict(state['optimizer_beta_v'])
        self.optimizer_beta_h.load_state_dict(state['optimizer_beta_h'])
        self.optimizer_pi_f.load_state_dict(state['optimizer_pi_f'])
        self.optimizer_pi_v.load_state_dict(state['optimizer_pi_v'])
        self.optimizer_pi_h.load_state_dict(state['optimizer_pi_h'])

        return state['aux']

    def resume(self, model_path):

        aux = self.load_checkpoint(model_path)
        # self.update_target()
        return aux

    def update_target(self):
        self.actor_critic_target.load_state_dict(
            self.actor_critic_model.state_dict())

    def batched_interp(self, x, xp, fp):
        # implemented with numpy: interpolate each row of x against the
        # matching rows of xp (x-coordinates) and fp (y-values)
        x = x.data.cpu().numpy()
        xp = xp.data.cpu().numpy()
        fp = fp.data.cpu().numpy()
        y = np.zeros(x.shape)

        for i, (xl, xpl, fpl) in enumerate(zip(x, xp, fp)):
            y[i] = np.interp(xl, xpl, fpl)

        return Variable(torch.from_numpy(y).float().cuda(), requires_grad=False)

    def new_distribution(self, q, beta, r, bin):
        # NOTE: this method is incomplete as written: the baddbmm projection is
        # commented out, and q_back, x, xp and fp are never defined in this scope.
        bin = bin.repeat(self.batch, self.global_action_space, 1)
        r = r.unsqueeze(1).repeat(1, bin.shape[0])
        beta = beta.unsqueeze(1)

        # dimensions:
        # bins [batch, actions, bins]
        # beta [batch, 1, actions]
        # new_bin = torch.baddbmm(r, beta, , alpha=self.discount)
        q_back.squeeze(1)
        return self.batched_interp(x, xp, fp)

    def learn(self, n_interval, n_tot):

        self.behavioral_model.train()
        self.actor_critic_model.train()
        self.actor_critic_target.eval()

        # The keys here must match the entries appended at the end of each batch.
        results = {
            'n': [],
            'loss_vs': [],
            'loss_vl': [],
            'loss_b': [],
            'loss_qs': [],
            'loss_ql': [],
            'loss_pi_s': [],
            'loss_pi_l': [],
            'loss_pi_s_tau': [],
            'loss_pi_l_tau': []
        }

        # The initial mode is an assumption: train_net selects whether the
        # value/beta heads or the policy heads are updated, and is flipped
        # below every update_n_steps_interval batches.
        train_net = True

        for n, sample in tqdm(enumerate(self.train_loader)):

            s = Variable(sample['s'].cuda(), requires_grad=False)
            a = Variable(sample['a'].cuda(), requires_grad=False)

            a_index = Variable(sample['a_index'].cuda(non_blocking=True),
                               requires_grad=False)

            rl = np.digitize(sample['score'].numpy(),
                             self.long_bins,
                             right=True)
            rs = np.digitize(sample['f'].numpy(), self.short_bins, right=True)

            Rl = Variable(sample['score'].cuda(), requires_grad=False)
            Rs = Variable(sample['f'].cuda(), requires_grad=False)

            rl = Variable(torch.from_numpy(rl).cuda(), requires_grad=False)
            rs = Variable(torch.from_numpy(rs).cuda(), requires_grad=False)

            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                s, a)

            # policy learning

            if self.alpha_vs and train_net:
                loss_vs = self.alpha_vs * self.loss_fn_vs(vs, rs)
                self.optimizer_vs.zero_grad()
                loss_vs.backward(retain_graph=True)
                self.optimizer_vs.step()
            else:
                loss_vs = self.zero

            if self.alpha_vl and train_net:
                loss_vl = self.alpha_vl * self.loss_fn_vl(vl, rl)
                self.optimizer_vl.zero_grad()
                loss_vl.backward(retain_graph=True)
                self.optimizer_vl.step()
            else:
                loss_vl = self.zero

            if self.alpha_b and train_net:
                loss_b = self.alpha_b * self.loss_fn_beta(beta, a_index)
                self.optimizer_beta.zero_grad()
                loss_b.backward(retain_graph=True)
                self.optimizer_beta.step()
            else:
                loss_b = self.zero

            if self.alpha_qs and train_net:
                loss_qs = self.alpha_qs * self.loss_fn_qs(qs, rs)
                self.optimizer_qs.zero_grad()
                loss_qs.backward(retain_graph=True)
                self.optimizer_qs.step()
            else:
                loss_qs = self.zero

            if self.alpha_ql and train_net:
                loss_ql = self.alpha_ql * self.loss_fn_ql(ql, rl)
                self.optimizer_ql.zero_grad()
                loss_ql.backward(retain_graph=True)
                self.optimizer_ql.step()
            else:
                loss_ql = self.zero

            a_index_np = sample['a_index'].numpy()
            self.batch_range = np.arange(self.batch)

            beta_sfm = F.softmax(beta, 1)
            pi_s_sfm = F.softmax(pi_s, 1)
            pi_l_sfm = F.softmax(pi_l, 1)
            pi_s_tau_sfm = F.softmax(pi_s_tau, 1)
            pi_l_tau_sfm = F.softmax(pi_l_tau, 1)

            beta_fix = Variable(beta_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_fix = Variable(pi_s_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_l_fix = Variable(pi_l_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_tau_fix = Variable(pi_s_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)
            pi_l_tau_fix = Variable(pi_l_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)

            if self.alpha_pi_s and not train_net:
                loss_pi_s = self.alpha_pi_s * self.loss_fn_pi_s(pi_s, a_index)
                loss_pi_s = (loss_pi_s * Rs *
                             self.off_factor(pi_s_fix, beta_fix)).mean()
                self.optimizer_pi_s.zero_grad()
                loss_pi_s.backward(retain_graph=True)
                self.optimizer_pi_s.step()
            else:
                loss_pi_s = self.zero

            if self.alpha_pi_l and not train_net:
                loss_pi_l = self.alpha_pi_l * self.loss_fn_pi_l(pi_l, a_index)
                loss_pi_l = (loss_pi_l * Rl *
                             self.off_factor(pi_l_fix, beta_fix)).mean()
                self.optimizer_pi_l.zero_grad()
                loss_pi_l.backward(retain_graph=True)
                self.optimizer_pi_l.step()
            else:
                loss_pi_l = self.zero

            if self.alpha_pi_s_tau and not train_net:
                loss_pi_s_tau = self.alpha_pi_s_tau * self.loss_fn_pi_s_tau(
                    pi_s_tau, a_index)
                w = self.get_weighted_loss(F.softmax(qs, 1),
                                           self.short_bins_torch)
                loss_pi_s_tau = (
                    loss_pi_s_tau * w *
                    self.off_factor(pi_s_tau_fix, beta_fix)).mean()
                self.optimizer_pi_s_tau.zero_grad()
                loss_pi_s_tau.backward(retain_graph=True)
                self.optimizer_pi_s_tau.step()
            else:
                loss_pi_s_tau = self.zero

            if self.alpha_pi_l_tau and not train_net:
                loss_pi_l_tau = self.alpha_pi_l_tau * self.loss_fn_pi_l_tau(
                    pi_l_tau, a_index)
                w = self.get_weighted_loss(F.softmax(ql, 1),
                                           self.long_bins_torch)
                loss_pi_l_tau = (
                    loss_pi_l_tau * w *
                    self.off_factor(pi_l_tau_fix, beta_fix)).mean()
                self.optimizer_pi_l_tau.zero_grad()
                loss_pi_l_tau.backward()
                self.optimizer_pi_l_tau.step()
            else:
                loss_pi_l_tau = self.zero

            # add results
            results['loss_vs'].append(loss_vs.data.cpu().numpy()[0])
            results['loss_vl'].append(loss_vl.data.cpu().numpy()[0])
            results['loss_b'].append(loss_b.data.cpu().numpy()[0])
            results['loss_qs'].append(loss_qs.data.cpu().numpy()[0])
            results['loss_ql'].append(loss_ql.data.cpu().numpy()[0])
            results['loss_pi_s'].append(loss_pi_s.data.cpu().numpy()[0])
            results['loss_pi_l'].append(loss_pi_l.data.cpu().numpy()[0])
            results['loss_pi_s_tau'].append(
                loss_pi_s_tau.data.cpu().numpy()[0])
            results['loss_pi_l_tau'].append(
                loss_pi_l_tau.data.cpu().numpy()[0])
            results['n'].append(n)

            # if not n % self.update_target_interval:
            #     # self.update_target()

            # if an index is drawn more than once during an update_memory_interval
            # period, only the last occurrence takes effect
            if not (
                    n + 1
            ) % self.update_memory_interval and self.prioritized_replay:
                self.train_dataset.update_probabilities()

            # update a global n_step parameter

            if not (n + 1) % self.update_n_steps_interval:
                # self.train_dataset.update_n_step(n + 1)
                d = np.divmod(n + 1, self.update_n_steps_interval)[0]
                if d % 10 == 1:
                    self.flip_grad(self.parameters_group_b +
                                   self.parameters_group_a)
                    train_net = not train_net
                if d % 10 == 2:
                    self.flip_grad(self.parameters_group_b +
                                   self.parameters_group_a)
                    train_net = not train_net

                    self.scheduler_pi_s.step()
                    self.scheduler_pi_l.step()
                    self.scheduler_pi_s_tau.step()
                    self.scheduler_pi_l_tau.step()
                else:
                    self.scheduler_vs.step()
                    self.scheduler_beta.step()
                    self.scheduler_vl.step()
                    self.scheduler_qs.step()
                    self.scheduler_ql.step()

            if not (n + 1) % n_interval:
                yield results
                self.model.train()
                # self.target.eval()
                results = {key: [] for key in results}

    def off_factor(self, pi, beta):
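        # Clamp the importance ratio pi / beta to [0, 1]: an off-policy correction
        # that never up-weights actions the behavioral policy found unlikely.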
        return torch.clamp(pi / beta, 0, 1)

    def test(self, n_interval, n_tot):

        self.model.eval()
        # self.target.eval()

        results = {
            'n': [],
            'loss_vs': [],
            'loss_b': [],
            'loss_vl': [],
            'loss_qs': [],
            'loss_ql': [],
            'act_diff': [],
            'a_agent': [],
            'a_player': [],
            'loss_pi_s': [],
            'loss_pi_l': [],
            'loss_pi_s_tau': [],
            'loss_pi_l_tau': []
        }

        for n, sample in tqdm(enumerate(self.test_loader)):

            s = Variable(sample['s'].cuda(), requires_grad=False)
            a = Variable(sample['a'].cuda().unsqueeze(1), requires_grad=False)

            a_index = Variable(sample['a_index'].cuda(non_blocking=True),
                               requires_grad=False)

            rl = np.digitize(sample['score'].numpy(),
                             self.long_bins,
                             right=True)
            rs = np.digitize(sample['f'].numpy(), self.short_bins, right=True)

            Rl = Variable(sample['score'].cuda(), requires_grad=False)
            Rs = Variable(sample['f'].cuda(), requires_grad=False)

            rl = Variable(torch.from_numpy(rl).cuda(), requires_grad=False)
            rs = Variable(torch.from_numpy(rs).cuda(), requires_grad=False)

            vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                s, a)

            qs = qs.squeeze(1)
            ql = ql.squeeze(1)

            # policy learning

            loss_vs = self.alpha_vs * self.loss_fn_vs(vs, rs)
            loss_vl = self.alpha_vl * self.loss_fn_vl(vl, rl)
            loss_b = self.alpha_b * self.loss_fn_beta(beta, a_index)
            loss_qs = self.alpha_qs * self.loss_fn_qs(qs, rs)
            loss_ql = self.alpha_ql * self.loss_fn_ql(ql, rl)

            a_index_np = sample['a_index'].numpy()
            self.batch_range = np.arange(self.batch)

            beta_sfm = F.softmax(beta, 1)
            pi_s_sfm = F.softmax(pi_s, 1)
            pi_l_sfm = F.softmax(pi_l, 1)
            pi_s_tau_sfm = F.softmax(pi_s_tau, 1)
            pi_l_tau_sfm = F.softmax(pi_l_tau, 1)

            beta_fix = Variable(beta_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_fix = Variable(pi_s_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_l_fix = Variable(pi_l_sfm.data[self.batch_range, a_index_np],
                                requires_grad=False)
            pi_s_tau_fix = Variable(pi_s_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)
            pi_l_tau_fix = Variable(pi_l_tau_sfm.data[self.batch_range,
                                                      a_index_np],
                                    requires_grad=False)

            loss_pi_s = self.alpha_pi_s * self.loss_fn_pi_s(pi_s, a_index)
            loss_pi_s = (loss_pi_s * Rs *
                         self.off_factor(pi_s_fix, beta_fix)).mean()

            loss_pi_l = self.alpha_pi_l * self.loss_fn_pi_l(pi_l, a_index)
            loss_pi_l = (loss_pi_l * Rl *
                         self.off_factor(pi_l_fix, beta_fix)).mean()

            loss_pi_s_tau = self.alpha_pi_s_tau * self.loss_fn_pi_s_tau(
                pi_s_tau, a_index)
            w = self.get_weighted_loss(F.softmax(qs, 1), self.short_bins_torch)
            loss_pi_s_tau = (loss_pi_s_tau * w *
                             self.off_factor(pi_s_tau_fix, beta_fix)).mean()

            loss_pi_l_tau = self.alpha_pi_l_tau * self.loss_fn_pi_l_tau(
                pi_l_tau, a_index)
            w = self.get_weighted_loss(F.softmax(ql, 1), self.long_bins_torch)
            loss_pi_l_tau = (loss_pi_l_tau * w *
                             self.off_factor(pi_l_tau_fix, beta_fix)).mean()

            # collect actions statistics
            a_index_np = a_index.data.cpu().numpy()

            _, beta_index = beta.data.cpu().max(1)
            beta_index = beta_index.numpy()
            act_diff = (a_index_np != beta_index).astype(int)

            # add results
            results['act_diff'].append(act_diff)
            results['a_agent'].append(beta_index)
            results['a_player'].append(a_index_np)
            results['loss_vs'].append(loss_vs.data.cpu().numpy()[0])
            results['loss_vl'].append(loss_vl.data.cpu().numpy()[0])
            results['loss_b'].append(loss_b.data.cpu().numpy()[0])
            results['loss_qs'].append(loss_qs.data.cpu().numpy()[0])
            results['loss_ql'].append(loss_ql.data.cpu().numpy()[0])
            results['loss_pi_s'].append(loss_pi_s.data.cpu().numpy()[0])
            results['loss_pi_l'].append(loss_pi_l.data.cpu().numpy()[0])
            results['loss_pi_s_tau'].append(
                loss_pi_s_tau.data.cpu().numpy()[0])
            results['loss_pi_l_tau'].append(
                loss_pi_l_tau.data.cpu().numpy()[0])
            results['n'].append(n)

            if not (n + 1) % n_interval:
                results['s'] = s.data.cpu()
                results['act_diff'] = np.concatenate(results['act_diff'])
                results['a_agent'] = np.concatenate(results['a_agent'])
                results['a_player'] = np.concatenate(results['a_player'])
                yield results
                self.model.eval()
                # self.target.eval()
                results = {key: [] for key in results}

    def play_stochastic(self, n_tot):
        raise NotImplementedError

    def play_episode(self, n_tot):

        self.model.eval()
        env = Env()

        n_human = 120
        humans_trajectories = iter(self.data)
        softmax = torch.nn.Softmax(dim=-1)

        # mask = torch.FloatTensor(consts.actions_mask[args.game])
        # mask = Variable(mask.cuda(), requires_grad=False)

        vsx = torch.FloatTensor(consts.short_bins[args.game])
        vlx = torch.FloatTensor(consts.long_bins[args.game])

        for i in range(n_tot):

            env.reset()
            observation = next(humans_trajectories)
            trajectory = self.data[observation]
            choices = np.arange(self.global_action_space, dtype=int)

            j = 0

            while not env.t:

                s = Variable(env.s.cuda(), requires_grad=False)
                vs, vl, beta, qs, ql, phi, pi_s, pi_l, pi_s_tau, pi_l_tau = self.model(
                    s, self.actions_matrix)
                beta = beta.squeeze(0)
                pi_l = pi_l.squeeze(0)
                pi_s = pi_s.squeeze(0)
                pi_l_tau = pi_l_tau.squeeze(0)
                pi_s_tau = pi_s_tau.squeeze(0)

                temp = 1

                # consider only 3 most frequent actions
                beta_np = beta.data.cpu().numpy()
                indices = np.argsort(beta_np)

                maskb = Variable(torch.FloatTensor(
                    [0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                                 requires_grad=False).cuda()
                # maskb = Variable(torch.FloatTensor([0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
                #                  requires_grad=False).cuda()

                # pi = maskb * (beta / beta.max())

                pi = beta
                self.greedy = True

                beta_prob = pi

                if j < n_human:
                    a = trajectory[j, self.meta['action']]

                else:
                    eps = np.random.rand()
                    # a = np.random.choice(choices)
                    if self.greedy and eps > 0.1:
                        a = pi.data.cpu().numpy()
                        a = np.argmax(a)
                    else:
                        a = softmax(pi / temp).data.cpu().numpy()
                        a = np.random.choice(choices, p=a)

                env.step(a)

                vs = softmax(vs)
                vl = softmax(vl)
                vs = torch.sum(vsx * vs.data.cpu())
                vl = torch.sum(vlx * vl.data.cpu())

                yield {
                    'o': env.s.cpu().numpy(),
                    'vs': np.array([vs]),
                    'vl': np.array([vl]),
                    's': phi.data.cpu().numpy(),
                    'score': env.score,
                    'beta': beta_prob.data.cpu().numpy(),
                    'phi': phi.squeeze(0).data.cpu().numpy(),
                    'qs': qs.squeeze(0).data.cpu().numpy(),
                    'ql': ql.squeeze(0).data.cpu().numpy(),
                }

                j += 1

        # PEP 479: raising StopIteration inside a generator is an error in
        # Python 3.7+; returning ends the generator cleanly.
        return

    def policy(self, vs, vl, beta, qs, ql):
        pass
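
Both learn() and test() above are generators that accumulate per-batch statistics and yield the results dictionary every n_interval batches before resetting it. A small, self-contained sketch of that consumption pattern (a toy loop with made-up numbers, independent of the classes above):

import numpy as np

def toy_learn(n_interval, n_tot):
    # Mimics the yield-every-n_interval structure of learn()/test().
    results = {'n': [], 'loss': []}
    for n in range(n_tot):
        results['n'].append(n)
        results['loss'].append(float(np.random.rand()))
        if not (n + 1) % n_interval:
            yield results
            results = {key: [] for key in results}

for chunk in toy_learn(n_interval=4, n_tot=12):
    print('batches', chunk['n'], 'mean loss', np.mean(chunk['loss']))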