    def __init__(self, load_dataset=True):

        super(BehavioralDistAgent, self).__init__()

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta,
                                                     self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta,
                                                    self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta,
                                                    self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                           train=True)
            self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                         train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                          train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(
                self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset,
                batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset,
                batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.val_loader = torch.utils.data.DataLoader(
                self.val_dataset,
                batch_sampler=self.val_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)

            self.episodic_loader = torch.utils.data.DataLoader(
                self.full_dataset,
                sampler=self.episodic_sampler,
                batch_size=self.batch,
                num_workers=args.cpu_workers)
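
            # The batch samplers above yield index batches directly from the
            # demonstration memory, so DataLoader's own batching is disabled
            # for those loaders; pin_memory speeds up host-to-GPU copies of
            # each batch.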

        if not self.wasserstein:
            self.loss_fn_vs = torch.nn.CrossEntropyLoss(size_average=True)
            self.loss_fn_qs = torch.nn.CrossEntropyLoss(size_average=True)
            self.loss_fn_vl = torch.nn.CrossEntropyLoss(size_average=True)
            self.loss_fn_ql = torch.nn.CrossEntropyLoss(size_average=True)
        else:
            self.loss_fn_vs = wasserstein_metric(support=args.atoms_short, n=1)
            self.loss_fn_qs = wasserstein_metric(support=args.atoms_short, n=1)
            self.loss_fn_vl = wasserstein_metric(support=args.atoms_long, n=1)
            self.loss_fn_ql = wasserstein_metric(support=args.atoms_long, n=1)

        # Inverse-frequency class weights from the demonstration action
        # histogram; clamping at 10 keeps the weights of very rare actions
        # bounded.
        self.histogram = torch.from_numpy(self.meta['histogram']).float()
        m = self.histogram.max()
        self.histogram = m / self.histogram
        self.histogram = torch.clamp(self.histogram, 0, 10).cuda()

        self.loss_fn_beta = torch.nn.CrossEntropyLoss(size_average=True,
                                                      weight=self.histogram)
        # Per-sample (unreduced) policy losses; size_average is ignored when
        # reduce=False, so it is omitted here.
        self.loss_fn_pi_s = torch.nn.CrossEntropyLoss(reduce=False)
        self.loss_fn_pi_l = torch.nn.CrossEntropyLoss(reduce=False)
        self.loss_fn_pi_s_tau = torch.nn.CrossEntropyLoss(reduce=False)
        self.loss_fn_pi_l_tau = torch.nn.CrossEntropyLoss(reduce=False)

        # alpha weighted sum

        self.alpha_b = 1  # 1 / 0.7

        self.alpha_vs = 1  # 1 / 0.02
        self.alpha_qs = 1

        self.alpha_vl = 1  # 1 / 0.02
        self.alpha_ql = 1

        self.alpha_pi_s = 1  # 1 / 0.02
        self.alpha_pi_l = 1

        self.alpha_pi_s_tau = 1  # 1 / 0.02
        self.alpha_pi_l_tau = 1

        self.model = BehavioralDistNet()
        self.model.cuda()

        # configure learning
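        # Parameters are grouped by sub-module name: "rn_" parameters form
        # the shared trunk and "on_*" the per-output heads, so each loss can
        # optimize the trunk plus its own head.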

        net_parameters = [
            p[1] for p in self.model.named_parameters() if "rn_" in p[0]
        ]
        vl_params = [
            p[1] for p in self.model.named_parameters() if "on_vl" in p[0]
        ]
        ql_params = [
            p[1] for p in self.model.named_parameters() if "on_ql" in p[0]
        ]
        vs_params = [
            p[1] for p in self.model.named_parameters() if "on_vs" in p[0]
        ]
        qs_params = [
            p[1] for p in self.model.named_parameters() if "on_qs" in p[0]
        ]
        beta_params = [
            p[1] for p in self.model.named_parameters() if "on_beta" in p[0]
        ]

        pi_s_params = [
            p[1] for p in self.model.named_parameters() if "on_pi_s" in p[0]
        ]
        pi_l_params = [
            p[1] for p in self.model.named_parameters() if "on_pi_l" in p[0]
        ]
        pi_tau_s_params = [
            p[1] for p in self.model.named_parameters()
            if "on_pi_tau_s" in p[0]
        ]
        pi_tau_l_params = [
            p[1] for p in self.model.named_parameters()
            if "on_pi_tau_l" in p[0]
        ]

        self.parameters_group_a = (net_parameters + vl_params + ql_params +
                                   vs_params + qs_params + beta_params)
        self.parameters_group_b = (pi_s_params + pi_l_params +
                                   pi_tau_s_params + pi_tau_l_params)

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER
        self.optimizer_vl = BehavioralDistAgent.set_optimizer(
            net_parameters + vl_params, args.lr_vl)
        self.scheduler_vl = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_vl, self.decay)

        self.optimizer_beta = BehavioralDistAgent.set_optimizer(
            net_parameters + beta_params, args.lr_beta)
        self.scheduler_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta, self.decay)

        self.optimizer_vs = BehavioralDistAgent.set_optimizer(
            net_parameters + vs_params, args.lr_vs)
        self.scheduler_vs = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_vs, self.decay)

        self.optimizer_qs = BehavioralDistAgent.set_optimizer(
            net_parameters + qs_params, args.lr_qs)
        self.scheduler_qs = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_qs, self.decay)

        self.optimizer_ql = BehavioralDistAgent.set_optimizer(
            net_parameters + ql_params, args.lr_ql)
        self.scheduler_ql = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_ql, self.decay)

        self.optimizer_pi_l = BehavioralDistAgent.set_optimizer(
            pi_l_params, args.lr_pi_l)
        self.scheduler_pi_l = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_l, self.decay)

        self.optimizer_pi_s = BehavioralDistAgent.set_optimizer(
            pi_s_params, args.lr_pi_s)
        self.scheduler_pi_s = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_s, self.decay)

        self.optimizer_pi_l_tau = BehavioralDistAgent.set_optimizer(
            pi_tau_l_params, args.lr_pi_tau_l)
        self.scheduler_pi_l_tau = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_l_tau, self.decay)

        self.optimizer_pi_s_tau = BehavioralDistAgent.set_optimizer(
            pi_tau_s_params, args.lr_pi_tau_s)
        self.scheduler_pi_s_tau = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_s_tau, self.decay)

        # Hot-vector action matrix, scaled by 1 / sqrt(3) (presumably to give
        # each ternary action vector unit norm).
        actions = torch.FloatTensor(consts.hotvec_matrix) / (3 ** 0.5)
        actions = Variable(actions, requires_grad=False).cuda()

        self.actions_matrix = actions.unsqueeze(0)
        self.reverse_excitation_index = consts.hotvec_inv

        self.short_bins = consts.short_bins[
            args.game][:-1] / self.meta['avg_score']
        # the long bins are already normalized
        self.long_bins = consts.long_bins[args.game][:-1]

        self.short_bins_torch = Variable(torch.from_numpy(
            consts.short_bins[args.game] / self.meta['avg_score']),
                                         requires_grad=False).cuda()
        self.long_bins_torch = Variable(torch.from_numpy(
            consts.long_bins[args.game]),
                                        requires_grad=False).cuda()

        self.batch_range = np.arange(self.batch)

        self.zero = Variable(torch.zeros(1))
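
        # `set_optimizer` is a helper defined elsewhere in this repo; a
        # minimal sketch of what it plausibly does (an assumption, not the
        # repo's actual code):
        #
        #     @staticmethod
        #     def set_optimizer(params, lr):
        #         return torch.optim.Adam(params, lr=lr)
        #
        # Each head is then trained by stepping its own optimizer, and its
        # learning rate is decayed via the matching ExponentialLR scheduler,
        # e.g. once per epoch:
        #
        #     self.optimizer_vl.zero_grad()
        #     loss_vl.backward()
        #     self.optimizer_vl.step()
        #     self.scheduler_vl.step()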
Example #2
    def __init__(self, load_dataset=True):

        super(ACDQNLSTMAgent, self).__init__()

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta, self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset, train=True)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset, train=False)

            self.train_loader = torch.utils.data.DataLoader(self.train_dataset, batch_sampler=self.train_sampler,
                                                            num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(self.test_dataset, batch_sampler=self.test_sampler,
                                                           num_workers=args.cpu_workers, pin_memory=True, drop_last=False)

        self.loss_v_beta = torch.nn.L1Loss(size_average=True, reduce=True)
        self.loss_q_beta = torch.nn.L1Loss(size_average=True, reduce=True)

        self.loss_v_pi = torch.nn.L1Loss(size_average=True, reduce=True)
        self.loss_q_pi = torch.nn.L1Loss(size_average=True, reduce=True)

        self.loss_p = torch.nn.L1Loss(size_average=True, reduce=True)

        # Inverse-frequency action weights from the demonstration histogram,
        # clamped so rare actions are not over-weighted.
        self.histogram = torch.from_numpy(self.meta['histogram']).float()
        weights = self.histogram.max() / self.histogram
        weights = torch.clamp(weights, 0, 10).cuda()

        # Weight the behavioral cross-entropy by the inverse-frequency
        # weights (as BehavioralDistAgent does above); without this, `weights`
        # would be computed but never used.
        self.loss_beta = torch.nn.CrossEntropyLoss(size_average=True,
                                                   weight=weights)
        self.loss_pi = torch.nn.CrossEntropyLoss(reduce=False)

        # actor critic setting

        self.model_b_single = ACDQNLSTM().cuda()
        self.model_single = ACDQNLSTM().cuda()
        self.target_single = ACDQNLSTM().cuda()

        if self.parallel:
            self.model_b = torch.nn.DataParallel(self.model_b_single)
            self.model = torch.nn.DataParallel(self.model_single)
            self.target = torch.nn.DataParallel(self.target_single)
        else:
            self.model_b = self.model_b_single
            self.model = self.model_single
            self.target = self.target_single

        self.target_single.reset_target()
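        # `model_b` learns the behavioral (demonstration) policy, `model` is
        # the online actor-critic, and `target` is its periodically-synced
        # copy; `reset_target()` presumably initializes the target weights.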
        # configure learning

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER


        self.optimizer_q_pi = ACDQNLSTMAgent.set_optimizer(self.model.parameters(), 0.0002)
        self.scheduler_q_pi = torch.optim.lr_scheduler.ExponentialLR(self.optimizer_q_pi, self.decay)

        self.optimizer_pi = ACDQNLSTMAgent.set_optimizer(self.model.parameters(), 0.0002)
        self.scheduler_pi = torch.optim.lr_scheduler.ExponentialLR(self.optimizer_pi, self.decay)

        self.optimizer_q_beta = ACDQNLSTMAgent.set_optimizer(self.model_b.parameters(), 0.0002)
        self.scheduler_q_beta = torch.optim.lr_scheduler.ExponentialLR(self.optimizer_q_beta, self.decay)

        self.optimizer_beta = ACDQNLSTMAgent.set_optimizer(self.model_b.parameters(), 0.0008)
        self.scheduler_beta = torch.optim.lr_scheduler.ExponentialLR(self.optimizer_beta, self.decay)

        actions = torch.LongTensor(consts.hotvec_matrix).cuda()
        self.actions_matrix = Variable(actions.unsqueeze(0), requires_grad=False)

        self.batch_actions_matrix = self.actions_matrix.repeat(self.batch, 1, 1)
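        # one copy of the action matrix per batch element, so all actions can
        # be evaluated for a whole batch in a single forward pass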

        self.batch_range = np.arange(self.batch)
        self.zero = Variable(torch.zeros(1))
        self.a_post_mat = Variable(torch.from_numpy(consts.a_post_mat).long(), requires_grad=False).cuda()
        self.a_post_mat = self.a_post_mat.unsqueeze(0).repeat(self.batch, 1, 1)
Example #3
    def __init__(self):

        super(LfdAgent, self).__init__()

        # demonstration source
        self.meta, self.data = preprocess_demonstrations()
        self.meta = divide_dataset(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta, self.data)
        self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
        self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset, train=True)
        self.val_sampler = DemonstrationBatchSampler(self.val_dataset, train=False)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset, train=False)
        self.episodic_sampler = SequentialDemonstrationSampler(self.full_dataset)

        self.train_loader = torch.utils.data.DataLoader(self.train_dataset, batch_sampler=self.train_sampler,
                                                        num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
        self.test_loader = torch.utils.data.DataLoader(self.test_dataset, batch_sampler=self.test_sampler,
                                                       num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
        self.val_loader = torch.utils.data.DataLoader(self.val_dataset, batch_sampler=self.val_sampler,
                                                      num_workers=args.cpu_workers, pin_memory=True, drop_last=False)

        self.episodic_loader = torch.utils.data.DataLoader(self.full_dataset, sampler=self.episodic_sampler,
                                                           batch_size=self.batch, num_workers=args.cpu_workers)

        # set learn validate test play parameters based on arguments
        # configure learning
        if not args.value_advantage:
            self.learn = self.learn_q
            self.test = self.test_q
            self.player = QPlayer
            self.agent_type = 'q'
            # loss function and optimizer

            if self.l1_loss:
                self.loss_fn = torch.nn.L1Loss(size_average=True)
                self.individual_loss_fn = self.individual_loss_fn_l1
            else:
                self.loss_fn = torch.nn.MSELoss(size_average=True)
                self.individual_loss_fn = self.individual_loss_fn_l2

            # Choose a model according to the configuration
            models = {0: DQN, 1: DQNDueling}
            Model = models[self.dueling]

            self.model_single = Model(self.action_space)
            self.target_single = Model(self.action_space)

        else:

            if args.value_only:
                self.alpha_v, self.alpha_a = 1, 0
            else:
                self.alpha_v, self.alpha_a = 1, 1

            if self.l1_loss:
                self.loss_fn_value = torch.nn.L1Loss(size_average=True)
                self.loss_fn_advantage = torch.nn.L1Loss(size_average=True)
                self.individual_loss_fn = self.individual_loss_fn_l1
            else:
                self.loss_fn_value = torch.nn.MSELoss(size_average=True)
                self.loss_fn_advantage = torch.nn.MSELoss(size_average=True)
                self.individual_loss_fn = self.individual_loss_fn_l2

            if not args.input_actions:
                self.learn = self.learn_va
                self.test = self.test_va
                self.player = AVPlayer
                self.agent_type = 'av'
                self.model_single = DVAN_ActionOut(self.action_space)
                self.target_single = DVAN_ActionOut(self.action_space)

            else:
                self.learn = self.learn_ava
                self.test = self.test_ava
                self.player = AVAPlayer
                self.agent_type = 'ava'
                self.model_single = DVAN_ActionIn(3)
                self.target_single = DVAN_ActionIn(3)

                # model specific parameters
                self.action_space = consts.action_space
                self.excitation = torch.LongTensor(consts.excitation_map)
                self.excitation_length = self.excitation.shape[0]
                self.mask = torch.LongTensor(consts.excitation_mask[args.game])
                self.mask_dup = self.mask.unsqueeze(0).repeat(self.action_space, 1)

                actions = Variable(self.mask_dup * self.excitation, requires_grad=False)
                actions = actions.cuda()

                self.actions_matrix = actions.unsqueeze(0)
                self.actions_matrix = self.actions_matrix.repeat(self.batch, 1, 1).float()
                self.reverse_excitation_index = consts.reverse_excitation_index

        if not args.play:
            self.play = self.dummy_play
        elif args.gpu_workers == 0:
            self.play = self.single_play
        else:
            self.play = self.multi_play

        # Select the Q-target estimator, keyed by (on_policy, double_q):
        # off-policy uses simple or double Q-learning, while on-policy always
        # uses the on-policy estimator regardless of the double-Q flag.
        q_functions = {(0, 0): self.simple_q, (0, 1): self.double_q,
                       (1, 0): self.simple_on_q, (1, 1): self.simple_on_q}
        self.q_estimator = q_functions[(self.on_policy, self.double_q)]

        # configure learning
        if args.cuda:
            self.model_single = self.model_single.cuda()
            self.target_single = self.target_single.cuda()
            self.model = torch.nn.DataParallel(self.model_single)
            self.target = torch.nn.DataParallel(self.target_single)
        else:
            self.model = self.model_single
            self.target = self.target_single
        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER
        self.optimizer = LfdAgent.set_optimizer(self.model.parameters())
        self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, self.decay)
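
        # A typical driver loop for this agent (a sketch; it assumes `learn`
        # and `test` iterate over train_loader/test_loader internally, and
        # `n_epochs` is hypothetical):
        #
        #     agent = LfdAgent()
        #     for epoch in range(n_epochs):
        #         agent.learn()
        #         agent.test()
        #         agent.scheduler.step()
Example #4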
    def __init__(self, load_dataset=True):

        super(BehavioralHotAgent, self).__init__()

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta,
                                                     self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta,
                                                    self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta,
                                                    self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                           train=True)
            self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                         train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                          train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(
                self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset,
                batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset,
                batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.val_loader = torch.utils.data.DataLoader(
                self.val_dataset,
                batch_sampler=self.val_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)

            self.episodic_loader = torch.utils.data.DataLoader(
                self.full_dataset,
                sampler=self.episodic_sampler,
                batch_size=self.batch,
                num_workers=args.cpu_workers)

        if self.l1_loss:
            self.loss_fn_value = torch.nn.L1Loss(size_average=True)
            self.loss_fn_q = torch.nn.L1Loss(size_average=True)
        else:
            self.loss_fn_value = torch.nn.MSELoss(size_average=True)
            self.loss_fn_q = torch.nn.MSELoss(size_average=True)

        self.loss_fn_r = torch.nn.MSELoss(size_average=True)
        self.loss_fn_p = torch.nn.L1Loss(size_average=True)

        if self.weight_by_expert:
            self.loss_fn_beta = torch.nn.CrossEntropyLoss(reduce=False)
        else:
            self.loss_fn_beta = torch.nn.CrossEntropyLoss(reduce=True)

        # alpha weighted sum

        self.alpha_v = 1  # 1 / 0.02
        self.alpha_b = 1  # 1 / 0.7

        self.alpha_r = 1  # 1 / 0.7
        self.alpha_p = 0  # 1 / 0.7
        self.alpha_q = 1

        self.model = BehavioralHotNet()
        self.model.cuda()

        # configure learning

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER
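        # Unlike BehavioralDistAgent above, every optimizer here spans the
        # full parameter set; only the learning rate differs per loss.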
        self.optimizer_v = BehavioralHotAgent.set_optimizer(
            self.model.parameters(), args.lr)
        self.scheduler_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_v, self.decay)

        self.optimizer_beta = BehavioralHotAgent.set_optimizer(
            self.model.parameters(), args.lr_beta)
        self.scheduler_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta, self.decay)

        self.optimizer_q = BehavioralHotAgent.set_optimizer(
            self.model.parameters(), args.lr_q)
        self.scheduler_q = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_q, self.decay)

        self.optimizer_r = BehavioralHotAgent.set_optimizer(
            self.model.parameters(), args.lr_r)
        self.scheduler_r = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_r, self.decay)

        self.optimizer_p = BehavioralHotAgent.set_optimizer(
            self.model.parameters(), args.lr_p)
        self.scheduler_p = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_p, self.decay)

        self.episodic_evaluator = self.dummy_episodic_evaluator

        actions = torch.FloatTensor(consts.hotvec_matrix) / (3**(0.5))
        actions = Variable(actions, requires_grad=False).cuda()

        self.actions_matrix = actions.unsqueeze(0)
Example #5
    def __init__(self, load_dataset=True):

        super(BehavioralAgent, self).__init__()

        self.actions_transform = np.array(consts.action2activation[args.game])

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta,
                                                     self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta,
                                                    self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta,
                                                    self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                           train=True)
            self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                         train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                          train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(
                self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset,
                batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset,
                batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.val_loader = torch.utils.data.DataLoader(
                self.val_dataset,
                batch_sampler=self.val_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)

            self.episodic_loader = torch.utils.data.DataLoader(
                self.full_dataset,
                sampler=self.episodic_sampler,
                batch_size=self.batch,
                num_workers=args.cpu_workers)

        if self.l1_loss:
            self.loss_fn_value = torch.nn.L1Loss(size_average=True)
            self.individual_loss_fn_value = self.individual_loss_fn_l1
        else:
            self.loss_fn_value = torch.nn.MSELoss(size_average=True)
            self.individual_loss_fn_value = self.individual_loss_fn_l2

        self.loss_fn_r = torch.nn.MSELoss(size_average=True)
        self.individual_loss_fn_r = self.individual_loss_fn_l2

        self.loss_fn_q = torch.nn.L1Loss(size_average=True)
        self.individual_loss_fn_q = self.individual_loss_fn_l1

        self.loss_fn_p = torch.nn.L1Loss(size_average=True)
        self.individual_loss_fn_p = self.individual_loss_fn_l1

        # self.target_single = BehavioralNet(self.global_action_space)

        # alpha weighted sum

        self.alpha_v = 1  # 1 / 0.02
        self.alpha_b = 1  # 1 / 0.7

        self.alpha_r = 1  # 1 / 0.7
        self.alpha_p = 1  # 1 / 0.7
        self.alpha_q = 1

        # Two behavioral-cloning variants: a deterministic policy regressed
        # with an L1 loss, or a stochastic policy trained with cross-entropy;
        # the learn/test/play entry points are bound accordingly.
        if args.deterministic:
            self.loss_fn_beta = torch.nn.L1Loss(size_average=True)
            self.learn = self.learn_deterministic
            self.test = self.test_deterministic
            self.play = self.play_deterministic
            self.play_episode = self.play_episode_deterministic
            self.model_single = BehavioralNetDeterministic(
                self.global_action_space)

        else:
            self.loss_fn_beta = torch.nn.CrossEntropyLoss()
            self.learn = self.learn_stochastic
            self.test = self.test_stochastic
            self.play = self.play_stochastic
            self.play_episode = self.play_episode_stochastic
            self.model_single = BehavioralNet(self.global_action_space)

        # configure learning

        if self.cuda:
            self.model_single = self.model_single.cuda()
            # self.model = torch.nn.DataParallel(self.model_single)
            self.model = self.model_single
            # self.target_single = self.target_single.cuda()
            # self.target = torch.nn.DataParallel(self.target_single)
        else:
            self.model = self.model_single
            # self.target = self.target_single

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER
        self.optimizer_v = BehavioralAgent.set_optimizer(
            self.model.parameters(), args.lr)
        self.scheduler_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_v, self.decay)

        self.optimizer_beta = BehavioralAgent.set_optimizer(
            self.model.parameters(), args.lr_beta)
        self.scheduler_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta, self.decay)

        self.optimizer_q = BehavioralAgent.set_optimizer(
            self.model.parameters(), args.lr_q)
        self.scheduler_q = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_q, self.decay)

        self.optimizer_r = BehavioralAgent.set_optimizer(
            self.model.parameters(), args.lr_r)
        self.scheduler_r = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_r, self.decay)

        self.optimizer_p = BehavioralAgent.set_optimizer(
            self.model.parameters(), args.lr_p)
        self.scheduler_p = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_p, self.decay)

        self.episodic_evaluator = self.dummy_episodic_evaluator

        # build the action matrix
        # excitation = torch.LongTensor(consts.game_excitation_map[args.game])
        excitation = torch.LongTensor(consts.excitation_map)
        mask = torch.LongTensor(consts.excitation_mask[args.game])
        mask_dup = mask.unsqueeze(0).repeat(consts.action_space, 1)
        # mask_dup is left unused here: the actions matrix is built from the
        # raw (unmasked) excitation map
        actions = Variable(excitation, requires_grad=False)
        if args.cuda:
            actions = actions.cuda()

        self.actions_matrix = actions.unsqueeze(0)
        self.actions_matrix = self.actions_matrix.repeat(1, 1, 1).float()

        self.go_to_max = np.inf  # 4096 * 8 * 4

        self.reverse_excitation_index = consts.reverse_excitation_index
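Example #6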
    def __init__(self, load_dataset=True):

        super(BehavioralEmbeddedAgent, self).__init__()

        self.meta, self.data = preprocess_demonstrations()

        if load_dataset:
            # demonstration source
            self.meta = divide_dataset(self.meta)

            # datasets
            self.train_dataset = DemonstrationMemory("train", self.meta,
                                                     self.data)
            self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
            self.test_dataset = DemonstrationMemory("test", self.meta,
                                                    self.data)
            self.full_dataset = DemonstrationMemory("full", self.meta,
                                                    self.data)

            self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                           train=True)
            self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                         train=False)
            self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                          train=False)
            self.episodic_sampler = SequentialDemonstrationSampler(
                self.full_dataset)

            self.train_loader = torch.utils.data.DataLoader(
                self.train_dataset,
                batch_sampler=self.train_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)
            self.test_loader = torch.utils.data.DataLoader(
                self.test_dataset,
                batch_sampler=self.test_sampler,
                num_workers=args.cpu_workers,
                pin_memory=True,
                drop_last=False)

        # Distributional value/Q losses: KL divergence between predicted and
        # target distributions (KLDivLoss expects log-probabilities as input
        # and probabilities as target).
        self.loss_v_beta = torch.nn.KLDivLoss()
        self.loss_q_beta = torch.nn.KLDivLoss()

        self.loss_v_pi = torch.nn.KLDivLoss()
        self.loss_q_pi = torch.nn.KLDivLoss()

        self.histogram = torch.from_numpy(self.meta['histogram']).float()

        w_f, w_v, w_h = calc_hist_weights(self.histogram)

        w_f = torch.clamp(w_f, 0, 10).cuda()
        w_v = torch.clamp(w_v, 0, 10).cuda()
        w_h = torch.clamp(w_h, 0, 10).cuda()

        self.loss_beta_f = torch.nn.CrossEntropyLoss(size_average=True,
                                                     weight=w_f)
        self.loss_beta_v = torch.nn.CrossEntropyLoss(size_average=True,
                                                     weight=w_v)
        self.loss_beta_h = torch.nn.CrossEntropyLoss(size_average=True,
                                                     weight=w_h)

        self.loss_pi_f = torch.nn.CrossEntropyLoss(size_average=False)
        self.loss_pi_v = torch.nn.CrossEntropyLoss(size_average=False)
        self.loss_pi_h = torch.nn.CrossEntropyLoss(size_average=False)

        self.behavioral_model = BehavioralDistEmbedding()
        self.behavioral_model.cuda()

        # actor critic setting

        self.actor_critic_model = ActorCritic()
        self.actor_critic_model.cuda()

        self.actor_critic_target = ActorCritic()
        self.actor_critic_target.cuda()

        # configure learning: group parameters by sub-module name so that the
        # behavioral heads, actor heads and critic heads each get their own
        # optimizer over the relevant slice of the network

        cnn_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "cnn" in p[0]
        ]
        emb_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "emb" in p[0]
        ]

        v_beta_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_v" in p[0]
        ]
        a_beta_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_adv" in p[0]
        ]

        beta_f_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_f" in p[0]
        ]
        beta_v_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_v" in p[0]
        ]
        beta_h_params = [
            p[1] for p in self.behavioral_model.named_parameters()
            if "fc_beta_h" in p[0]
        ]

        v_pi_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "critic_v" in p[0]
        ]
        a_pi_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "critic_adv" in p[0]
        ]

        pi_f_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_f" in p[0]
        ]
        pi_v_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_v" in p[0]
        ]
        pi_h_params = [
            p[1] for p in self.actor_critic_model.named_parameters()
            if "fc_actor_h" in p[0]
        ]

        # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING OPTIMIZER

        self.optimizer_critic_v = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params, 0.0008)
        self.scheduler_critic_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_v, self.decay)

        self.optimizer_critic_q = BehavioralEmbeddedAgent.set_optimizer(
            v_pi_params + a_pi_params, 0.0008)
        self.scheduler_critic_q = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_critic_q, self.decay)

        self.optimizer_v_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params, 0.0008)
        self.scheduler_v_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_v_beta, self.decay)

        self.optimizer_q_beta = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + v_beta_params + a_beta_params, 0.0008)
        self.scheduler_q_beta = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_q_beta, self.decay)

        self.optimizer_beta_f = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_f_params, 0.0008)
        self.scheduler_beta_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_f, self.decay)

        self.optimizer_beta_v = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_v_params, 0.0008)
        self.scheduler_beta_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_v, self.decay)

        self.optimizer_beta_h = BehavioralEmbeddedAgent.set_optimizer(
            cnn_params + emb_params + beta_h_params, 0.0008)
        self.scheduler_beta_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_beta_h, self.decay)

        self.optimizer_pi_f = BehavioralEmbeddedAgent.set_optimizer(
            pi_f_params, 0.0008)
        self.scheduler_pi_f = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_f, self.decay)

        self.optimizer_pi_v = BehavioralEmbeddedAgent.set_optimizer(
            pi_v_params, 0.0008)
        self.scheduler_pi_v = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_v, self.decay)

        self.optimizer_pi_h = BehavioralEmbeddedAgent.set_optimizer(
            pi_h_params, 0.0008)
        self.scheduler_pi_h = torch.optim.lr_scheduler.ExponentialLR(
            self.optimizer_pi_h, self.decay)

        actions = torch.LongTensor(consts.hotvec_matrix).cuda()
        self.actions_matrix = actions.unsqueeze(0)

        # both bin sets are normalized by the demonstrations' average score
        self.q_bins = consts.q_bins[args.game][:-1] / self.meta['avg_score']
        self.v_bins = consts.v_bins[args.game][:-1] / self.meta['avg_score']

        self.q_bins_torch = Variable(torch.from_numpy(
            consts.q_bins[args.game] / self.meta['avg_score']),
                                     requires_grad=False).cuda()
        self.v_bins_torch = Variable(torch.from_numpy(
            consts.v_bins[args.game] / self.meta['avg_score']),
                                     requires_grad=False).cuda()

        self.batch_range = np.arange(self.batch)

        self.zero = Variable(torch.zeros(1))
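
        # The bins discretize normalized returns into the atoms of the
        # categorical value/Q distributions; a target atom index can be
        # obtained with np.digitize (a sketch, assuming `score` is already
        # divided by meta['avg_score']):
        #
        #     idx = np.digitize(score, self.q_bins)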