def main():
    """Log the run configuration and execute every session enabled in args.

    The sessions run in a fixed order (visualize, behavioral,
    play_behavioral, lfd, train, play); each one is skipped unless its
    flag on ``args`` is truthy.
    """
    pad = ' ' * 26

    # print args of current run
    logger.info("Welcome to Learning from Demonstration simulation")
    logger.info(pad + 'Simulation Hyperparameters')
    for name, value in vars(args).items():
        logger.info(pad + name + ': ' + str(value))

    if args.reload_data:
        preprocess_demonstrations(args.reload_data)

    # Device 0 is selected in parallel mode, otherwise the configured default.
    device_index = 0 if args.parallel else args.cuda_default

    model = None
    with torch.cuda.device(device_index), Experiment() as exp:
        if args.visualize:
            logger.info("Vusualize model parameters")
            model = exp.visualize('model_b')

        if args.behavioral:
            logger.info(
                "Enter Behavioral Learning Session, it might take a while")
            model = exp.behavioral()

        if args.play_behavioral:
            logger.info("Enter Behavioral playing, I hope it goes well")
            model = exp.play_behavioral()

        if args.lfd:
            logger.info("Enter LfD Session, it might take a while")
            model = exp.lfd()

        if args.train:
            # Training receives whatever the last enabled session returned
            # (None when no earlier session ran).
            logger.info("Enter Training Session, it might take ages")
            model = exp.train(model)

        if args.play:
            logger.info("Enter Testing Session, it should be fun")
            exp.play()

    logger.info("End of simulation")
def __init__(self, load_dataset=True):
    """Build the data pipeline, losses, network, optimizers and constants.

    Args:
        load_dataset: when truthy, split the demonstration meta-data and
            create the train/val/test/full datasets with their samplers
            and loaders.
    """
    super(BehavioralDistAgent, self).__init__()

    self.meta, self.data = preprocess_demonstrations()

    if load_dataset:
        # NOTE(review): the whole data pipeline below is assumed to be
        # guarded by load_dataset -- confirm against the original layout.

        # demonstration source
        self.meta = divide_dataset(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta, self.data)
        self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
        self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                       train=True)
        # The validation sampler must be built over the dataset that
        # val_loader reads from; it was previously built over train_dataset.
        self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                     train=False)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                      train=False)
        self.episodic_sampler = SequentialDemonstrationSampler(
            self.full_dataset)

        def batch_loader(dataset, batch_sampler):
            # All batch-sampled loaders share the same worker/pinning setup.
            return torch.utils.data.DataLoader(
                dataset, batch_sampler=batch_sampler,
                num_workers=args.cpu_workers, pin_memory=True,
                drop_last=False)

        self.train_loader = batch_loader(self.train_dataset,
                                         self.train_sampler)
        self.test_loader = batch_loader(self.test_dataset, self.test_sampler)
        self.val_loader = batch_loader(self.val_dataset, self.val_sampler)

        self.episodic_loader = torch.utils.data.DataLoader(
            self.full_dataset, sampler=self.episodic_sampler,
            batch_size=self.batch, num_workers=args.cpu_workers)

    # Value/Q losses: cross-entropy unless the wasserstein flag is set.
    if not self.wasserstein:
        self.loss_fn_vs = torch.nn.CrossEntropyLoss(size_average=True)
        self.loss_fn_qs = torch.nn.CrossEntropyLoss(size_average=True)
        self.loss_fn_vl = torch.nn.CrossEntropyLoss(size_average=True)
        self.loss_fn_ql = torch.nn.CrossEntropyLoss(size_average=True)
    else:
        self.loss_fn_vs = wasserstein_metric(support=args.atoms_short, n=1)
        self.loss_fn_qs = wasserstein_metric(support=args.atoms_short, n=1)
        self.loss_fn_vl = wasserstein_metric(support=args.atoms_long, n=1)
        self.loss_fn_ql = wasserstein_metric(support=args.atoms_long, n=1)

    # Class weights for the behavioral loss: max(histogram) / histogram,
    # clamped to [0, 10].
    self.histogram = torch.from_numpy(self.meta['histogram']).float()
    m = self.histogram.max()
    self.histogram = m / self.histogram
    self.histogram = torch.clamp(self.histogram, 0, 10).cuda()

    self.loss_fn_beta = torch.nn.CrossEntropyLoss(size_average=True,
                                                  weight=self.histogram)
    self.loss_fn_pi_s = torch.nn.CrossEntropyLoss(reduce=False,
                                                  size_average=True)
    self.loss_fn_pi_l = torch.nn.CrossEntropyLoss(reduce=False,
                                                  size_average=True)
    self.loss_fn_pi_s_tau = torch.nn.CrossEntropyLoss(reduce=False,
                                                      size_average=True)
    self.loss_fn_pi_l_tau = torch.nn.CrossEntropyLoss(reduce=False,
                                                      size_average=True)

    # alpha weighted sum
    self.alpha_b = 1  # 1 / 0.7
    self.alpha_vs = 1  # 1 / 0.02
    self.alpha_qs = 1
    self.alpha_vl = 1  # 1 / 0.02
    self.alpha_ql = 1
    self.alpha_pi_s = 1  # 1 / 0.02
    self.alpha_pi_l = 1
    self.alpha_pi_s_tau = 1  # 1 / 0.02
    self.alpha_pi_l_tau = 1

    self.model = BehavioralDistNet()
    self.model.cuda()

    # configure learning
    def params_matching(tag):
        # Parameters whose registered name contains the given tag.
        return [param for name, param in self.model.named_parameters()
                if tag in name]

    net_parameters = params_matching("rn_")
    vl_params = params_matching("on_vl")
    ql_params = params_matching("on_ql")
    vs_params = params_matching("on_vs")
    qs_params = params_matching("on_qs")
    beta_params = params_matching("on_beta")
    pi_s_params = params_matching("on_pi_s")
    pi_l_params = params_matching("on_pi_l")
    pi_tau_s_params = params_matching("on_pi_tau_s")
    pi_tau_l_params = params_matching("on_pi_tau_l")

    self.parameters_group_a = (net_parameters + vl_params + ql_params +
                               vs_params + qs_params + beta_params)
    self.parameters_group_b = (pi_s_params + pi_l_params +
                               pi_tau_s_params + pi_tau_l_params)

    def optimizer_with_scheduler(params, lr):
        # Each optimizer gets its own exponential LR decay schedule.
        optimizer = BehavioralDistAgent.set_optimizer(params, lr)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                           self.decay)
        return optimizer, scheduler

    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    self.optimizer_vl, self.scheduler_vl = optimizer_with_scheduler(
        net_parameters + vl_params, args.lr_vl)
    self.optimizer_beta, self.scheduler_beta = optimizer_with_scheduler(
        net_parameters + beta_params, args.lr_beta)
    self.optimizer_vs, self.scheduler_vs = optimizer_with_scheduler(
        net_parameters + vs_params, args.lr_vs)
    self.optimizer_qs, self.scheduler_qs = optimizer_with_scheduler(
        net_parameters + qs_params, args.lr_qs)
    self.optimizer_ql, self.scheduler_ql = optimizer_with_scheduler(
        net_parameters + ql_params, args.lr_ql)
    self.optimizer_pi_l, self.scheduler_pi_l = optimizer_with_scheduler(
        pi_l_params, args.lr_pi_l)
    self.optimizer_pi_s, self.scheduler_pi_s = optimizer_with_scheduler(
        pi_s_params, args.lr_pi_s)
    self.optimizer_pi_l_tau, self.scheduler_pi_l_tau = \
        optimizer_with_scheduler(pi_tau_l_params, args.lr_pi_tau_l)
    self.optimizer_pi_s_tau, self.scheduler_pi_s_tau = \
        optimizer_with_scheduler(pi_tau_s_params, args.lr_pi_tau_s)

    # Action matrix scaled by 1/sqrt(3), with a leading axis of size 1.
    actions = torch.FloatTensor(consts.hotvec_matrix) / (3**(0.5))
    actions = Variable(actions, requires_grad=False).cuda()

    self.actions_matrix = actions.unsqueeze(0)
    self.reverse_excitation_index = consts.hotvec_inv

    # Short bins are divided by the average score; the last edge is dropped
    # for the numpy copies but kept for the torch copies.
    self.short_bins = consts.short_bins[
        args.game][:-1] / self.meta['avg_score']
    # the long bins are already normalized
    self.long_bins = consts.long_bins[args.game][:-1]

    self.short_bins_torch = Variable(torch.from_numpy(
        consts.short_bins[args.game] / self.meta['avg_score']),
        requires_grad=False).cuda()
    self.long_bins_torch = Variable(torch.from_numpy(
        consts.long_bins[args.game]), requires_grad=False).cuda()

    self.batch_range = np.arange(self.batch)

    self.zero = Variable(torch.zeros(1))
def __init__(self, load_dataset=True):
    """Create datasets, losses, the ten networks, optimizers and masks.

    Args:
        load_dataset: when truthy, split the demonstrations by episodes and
            build the train/test datasets, samplers and loaders.
    """
    print("Detached Agent")

    super(DetachedAgent, self).__init__()

    self.meta, self.data = preprocess_demonstrations()

    if load_dataset:
        # NOTE(review): the data pipeline is assumed to sit entirely under
        # load_dataset -- confirm against the original layout.

        # demonstration source
        self.meta = divide_dataset_by_episodes(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta,
                                                 self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                       train=True)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                      train=False)

        loader_options = dict(num_workers=args.cpu_workers, pin_memory=True,
                              drop_last=False)
        self.train_loader = torch.utils.data.DataLoader(
            self.train_dataset, batch_sampler=self.train_sampler,
            **loader_options)
        self.test_loader = torch.utils.data.DataLoader(
            self.test_dataset, batch_sampler=self.test_sampler,
            **loader_options)

    self.norm = 2

    # Three independent MSE criteria, one per regression head.
    self.loss_v_beta = torch.nn.MSELoss(size_average=True, reduce=True)
    self.loss_q_pi = torch.nn.MSELoss(size_average=True, reduce=True)
    self.loss_q_beta = torch.nn.MSELoss(size_average=True, reduce=True)

    self.histogram = torch.from_numpy(self.meta['histogram']).float().cuda()

    if not self.balance:
        # Class weights: max(h + eps) / (h + eps).
        smoothed = self.histogram + args.balance_epsilone
        class_weights = smoothed.max() / smoothed
        self.loss_beta = torch.nn.CrossEntropyLoss(size_average=True,
                                                   weight=class_weights)
    else:
        self.loss_beta = torch.nn.CrossEntropyLoss(size_average=True)

    self.loss_pi = torch.nn.CrossEntropyLoss(reduce=False)

    # actor critic setting -- construction order is kept as in the original
    self.beta_net = DPiN().cuda()
    self.beta_target = DPiN().cuda()

    self.pi_net = DPiN().cuda()
    self.pi_target = DPiN().cuda()

    self.vb_net = DVN().cuda()
    self.vb_target = DVN().cuda()

    self.qb_net = DQN().cuda()
    self.qb_target = DQN().cuda()

    self.q_net = DQN().cuda()
    self.q_target = DQN().cuda()

    # configure learning
    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    learning_setup = (
        ("q_pi", self.q_net, 0.0001),
        ("q_beta", self.qb_net, 0.001),
        ("pi", self.pi_net, 0.0002),
        ("v_beta", self.vb_net, 0.001),
        ("beta", self.beta_net, 0.01),
    )
    for suffix, network, learning_rate in learning_setup:
        optimizer = DetachedAgent.set_optimizer(network.parameters(),
                                                learning_rate)
        setattr(self, "optimizer_" + suffix, optimizer)
        setattr(self, "scheduler_" + suffix,
                torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                       self.decay))

    hotvec = torch.LongTensor(consts.hotvec_matrix).cuda()
    self.actions_matrix = Variable(hotvec.unsqueeze(0), requires_grad=False)
    self.batch_actions_matrix = self.actions_matrix.repeat(self.batch, 1, 1)

    # Additive mask: entries equal to 0 become -inf first, then entries
    # equal to 1 become 0. The order of the two assignments matters.
    self.mask_beta = Variable(
        torch.FloatTensor(consts.behavioral_mask[args.game]),
        requires_grad=False).cuda()
    self.mask_beta[self.mask_beta == 0] = -float("Inf")
    self.mask_beta[self.mask_beta == 1] = 0
    self.mask_beta_batch = self.mask_beta.repeat(self.batch, 1)

    # The Q mask keeps the raw mask values.
    self.mask_q = Variable(
        torch.FloatTensor(consts.behavioral_mask[args.game]),
        requires_grad=False).cuda()
    self.mask_q_batch = self.mask_q.repeat(self.batch, 1)

    self.zero = Variable(torch.zeros(1))

    self.mc = True
def __init__(self, load_dataset=True):
    """Create datasets, losses, the three networks and their optimizers.

    Args:
        load_dataset: when truthy, split the demonstration meta-data and
            build the train/test datasets, samplers and loaders.
    """
    super(ACDQNLSTMAgent, self).__init__()

    self.meta, self.data = preprocess_demonstrations()

    if load_dataset:
        # NOTE(review): the data pipeline is assumed to sit entirely under
        # load_dataset -- confirm against the original layout.

        # demonstration source
        self.meta = divide_dataset(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta,
                                                 self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                       train=True)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                      train=False)

        self.train_loader = torch.utils.data.DataLoader(
            self.train_dataset, batch_sampler=self.train_sampler,
            num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
        self.test_loader = torch.utils.data.DataLoader(
            self.test_dataset, batch_sampler=self.test_sampler,
            num_workers=args.cpu_workers, pin_memory=True, drop_last=False)

    self.loss_v_beta = torch.nn.L1Loss(size_average=True, reduce=True)
    self.loss_q_beta = torch.nn.L1Loss(size_average=True, reduce=True)

    self.loss_v_pi = torch.nn.L1Loss(size_average=True, reduce=True)
    self.loss_q_pi = torch.nn.L1Loss(size_average=True, reduce=True)

    self.loss_p = torch.nn.L1Loss(size_average=True, reduce=True)

    self.histogram = torch.from_numpy(self.meta['histogram']).float()
    # The behavioral loss is unweighted here. A clamped max/histogram weight
    # tensor used to be computed and moved to the GPU at this point but was
    # never passed to any loss, so that dead computation was removed.
    # NOTE(review): if class weighting was intended, pass weight=... below.
    self.loss_beta = torch.nn.CrossEntropyLoss(size_average=True)
    self.loss_pi = torch.nn.CrossEntropyLoss(reduce=False)

    # actor critic setting
    self.model_b_single = ACDQNLSTM().cuda()
    self.model_single = ACDQNLSTM().cuda()
    self.target_single = ACDQNLSTM().cuda()

    if self.parallel:
        self.model_b = torch.nn.DataParallel(self.model_b_single)
        self.model = torch.nn.DataParallel(self.model_single)
        self.target = torch.nn.DataParallel(self.target_single)
    else:
        self.model_b = self.model_b_single
        self.model = self.model_single
        self.target = self.target_single

    self.target_single.reset_target()

    # configure learning
    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    # Two optimizers are created over self.model and two over self.model_b;
    # each has its own learning rate and decay schedule.
    self.optimizer_q_pi = ACDQNLSTMAgent.set_optimizer(
        self.model.parameters(), 0.0002)
    self.scheduler_q_pi = torch.optim.lr_scheduler.ExponentialLR(
        self.optimizer_q_pi, self.decay)

    self.optimizer_pi = ACDQNLSTMAgent.set_optimizer(
        self.model.parameters(), 0.0002)
    self.scheduler_pi = torch.optim.lr_scheduler.ExponentialLR(
        self.optimizer_pi, self.decay)

    self.optimizer_q_beta = ACDQNLSTMAgent.set_optimizer(
        self.model_b.parameters(), 0.0002)
    self.scheduler_q_beta = torch.optim.lr_scheduler.ExponentialLR(
        self.optimizer_q_beta, self.decay)

    self.optimizer_beta = ACDQNLSTMAgent.set_optimizer(
        self.model_b.parameters(), 0.0008)
    self.scheduler_beta = torch.optim.lr_scheduler.ExponentialLR(
        self.optimizer_beta, self.decay)

    actions = torch.LongTensor(consts.hotvec_matrix).cuda()
    self.actions_matrix = Variable(actions.unsqueeze(0), requires_grad=False)
    self.batch_actions_matrix = self.actions_matrix.repeat(self.batch, 1, 1)

    self.batch_range = np.arange(self.batch)
    self.zero = Variable(torch.zeros(1))

    # a_post_mat is tiled once per batch element along a new leading axis.
    self.a_post_mat = Variable(torch.from_numpy(consts.a_post_mat).long(),
                               requires_grad=False).cuda()
    self.a_post_mat = self.a_post_mat.unsqueeze(0).repeat(self.batch, 1, 1)
def __init__(self):
    """Build the data pipeline and select the model, losses and callbacks.

    The learn/test/player callbacks, the loss functions and the network
    class are chosen from ``args`` and from flags already set on the
    instance (``l1_loss``, ``dueling``, ``double_q``, ``on_policy``).
    """
    super(LfdAgent, self).__init__()

    # demonstration source
    self.meta, self.data = preprocess_demonstrations()
    self.meta = divide_dataset(self.meta)

    # datasets
    self.train_dataset = DemonstrationMemory("train", self.meta, self.data)
    self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
    self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
    self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

    self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                   train=True)
    # The validation sampler must be built over the dataset that val_loader
    # reads from; it was previously built over train_dataset.
    self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                 train=False)
    self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                  train=False)
    self.episodic_sampler = SequentialDemonstrationSampler(self.full_dataset)

    self.train_loader = torch.utils.data.DataLoader(
        self.train_dataset, batch_sampler=self.train_sampler,
        num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
    self.test_loader = torch.utils.data.DataLoader(
        self.test_dataset, batch_sampler=self.test_sampler,
        num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
    self.val_loader = torch.utils.data.DataLoader(
        self.val_dataset, batch_sampler=self.val_sampler,
        num_workers=args.cpu_workers, pin_memory=True, drop_last=False)

    self.episodic_loader = torch.utils.data.DataLoader(
        self.full_dataset, sampler=self.episodic_sampler,
        batch_size=self.batch, num_workers=args.cpu_workers)

    # set learn validate test play parameters based on arguments
    # configure learning
    # NOTE(review): self.action_space is read in the branches below but is
    # only assigned from consts further down; this presumably relies on the
    # base class having set it already -- confirm.
    if not args.value_advantage:
        self.learn = self.learn_q
        self.test = self.test_q
        self.player = QPlayer
        self.agent_type = 'q'

        # loss function and optimizer
        if self.l1_loss:
            self.loss_fn = torch.nn.L1Loss(size_average=True)
            self.individual_loss_fn = self.individual_loss_fn_l1
        else:
            self.loss_fn = torch.nn.MSELoss(size_average=True)
            self.individual_loss_fn = self.individual_loss_fn_l2

        # Choose a model according to the configuration
        models = {(0,): DQN, (1,): DQNDueling}
        Model = models[(self.dueling,)]

        self.model_single = Model(self.action_space)
        self.target_single = Model(self.action_space)

    else:
        if args.value_only:
            self.alpha_v, self.alpha_a = 1, 0
        else:
            self.alpha_v, self.alpha_a = 1, 1

        if self.l1_loss:
            self.loss_fn_value = torch.nn.L1Loss(size_average=True)
            self.loss_fn_advantage = torch.nn.L1Loss(size_average=True)
            self.individual_loss_fn = self.individual_loss_fn_l1
        else:
            self.loss_fn_value = torch.nn.MSELoss(size_average=True)
            self.loss_fn_advantage = torch.nn.MSELoss(size_average=True)
            self.individual_loss_fn = self.individual_loss_fn_l2

        if not args.input_actions:
            self.learn = self.learn_va
            self.test = self.test_va
            self.player = AVPlayer
            self.agent_type = 'av'
            self.model_single = DVAN_ActionOut(self.action_space)
            self.target_single = DVAN_ActionOut(self.action_space)
        else:
            self.learn = self.learn_ava
            self.test = self.test_ava
            self.player = AVAPlayer
            self.agent_type = 'ava'
            self.model_single = DVAN_ActionIn(3)
            self.target_single = DVAN_ActionIn(3)

    # model specific parameters
    self.action_space = consts.action_space
    self.excitation = torch.LongTensor(consts.excitation_map)
    self.excitation_length = self.excitation.shape[0]
    self.mask = torch.LongTensor(consts.excitation_mask[args.game])
    self.mask_dup = self.mask.unsqueeze(0).repeat(self.action_space, 1)

    # Masked excitation map, tiled once per batch element and cast to float.
    actions = Variable(self.mask_dup * self.excitation, requires_grad=False)
    actions = actions.cuda()

    self.actions_matrix = actions.unsqueeze(0)
    self.actions_matrix = self.actions_matrix.repeat(self.batch, 1,
                                                     1).float()
    self.reverse_excitation_index = consts.reverse_excitation_index

    if not args.play:
        self.play = self.dummy_play
    elif args.gpu_workers == 0:
        self.play = self.single_play
    else:
        self.play = self.multi_play

    # NOTE(review): self.double_q appears both as a table value and as part
    # of the lookup key, and the (0, 1)/(1, 0) entries look swapped relative
    # to the (double_q, on_policy) key order. Left unchanged -- confirm the
    # intended mapping.
    q_functions = {
        (0, 0): self.simple_q,
        (0, 1): self.double_q,
        (1, 0): self.simple_on_q,
        (1, 1): self.simple_on_q,
    }
    self.q_estimator = q_functions[(self.double_q, self.on_policy)]

    # configure learning
    if args.cuda:
        self.model_single = self.model_single.cuda()
        self.target_single = self.target_single.cuda()
        self.model = torch.nn.DataParallel(self.model_single)
        self.target = torch.nn.DataParallel(self.target_single)
    else:
        self.model = self.model_single
        self.target = self.target_single

    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    self.optimizer = LfdAgent.set_optimizer(self.model.parameters())
    self.scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer,
                                                            self.decay)
def __init__(self, load_dataset=True):
    """Build the data pipeline, losses, network and per-head optimizers.

    Args:
        load_dataset: when truthy, split the demonstration meta-data and
            create the train/val/test/full datasets with their samplers
            and loaders.
    """
    super(BehavioralHotAgent, self).__init__()

    self.meta, self.data = preprocess_demonstrations()

    if load_dataset:
        # NOTE(review): the data pipeline is assumed to sit entirely under
        # load_dataset -- confirm against the original layout.

        # demonstration source
        self.meta = divide_dataset(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta,
                                                 self.data)
        self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
        self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                       train=True)
        # The validation sampler must be built over the dataset that
        # val_loader reads from; it was previously built over train_dataset.
        self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                     train=False)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                      train=False)
        self.episodic_sampler = SequentialDemonstrationSampler(
            self.full_dataset)

        def batch_loader(dataset, batch_sampler):
            # All batch-sampled loaders share the same worker/pinning setup.
            return torch.utils.data.DataLoader(
                dataset, batch_sampler=batch_sampler,
                num_workers=args.cpu_workers, pin_memory=True,
                drop_last=False)

        self.train_loader = batch_loader(self.train_dataset,
                                         self.train_sampler)
        self.test_loader = batch_loader(self.test_dataset, self.test_sampler)
        self.val_loader = batch_loader(self.val_dataset, self.val_sampler)

        self.episodic_loader = torch.utils.data.DataLoader(
            self.full_dataset, sampler=self.episodic_sampler,
            batch_size=self.batch, num_workers=args.cpu_workers)

    if self.l1_loss:
        self.loss_fn_value = torch.nn.L1Loss(size_average=True)
        self.loss_fn_q = torch.nn.L1Loss(size_average=True)
    else:
        self.loss_fn_value = torch.nn.MSELoss(size_average=True)
        self.loss_fn_q = torch.nn.MSELoss(size_average=True)

    self.loss_fn_r = torch.nn.MSELoss(size_average=True)
    self.loss_fn_p = torch.nn.L1Loss(size_average=True)

    # With weight_by_expert the behavioral loss is left unreduced.
    if self.weight_by_expert:
        self.loss_fn_beta = torch.nn.CrossEntropyLoss(reduce=False)
    else:
        self.loss_fn_beta = torch.nn.CrossEntropyLoss(reduce=True)

    # alpha weighted sum
    self.alpha_v = 1  # 1 / 0.02
    self.alpha_b = 1  # 1 / 0.7
    self.alpha_r = 1  # 1 / 0.7
    self.alpha_p = 0  # 1 / 0.7
    self.alpha_q = 1

    self.model = BehavioralHotNet()
    self.model.cuda()

    def optimizer_with_scheduler(lr):
        # Every optimizer covers all model parameters; only the learning
        # rate differs between them.
        optimizer = BehavioralHotAgent.set_optimizer(
            self.model.parameters(), lr)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                           self.decay)
        return optimizer, scheduler

    # configure learning
    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    self.optimizer_v, self.scheduler_v = optimizer_with_scheduler(args.lr)
    self.optimizer_beta, self.scheduler_beta = optimizer_with_scheduler(
        args.lr_beta)
    self.optimizer_q, self.scheduler_q = optimizer_with_scheduler(args.lr_q)
    self.optimizer_r, self.scheduler_r = optimizer_with_scheduler(args.lr_r)
    self.optimizer_p, self.scheduler_p = optimizer_with_scheduler(args.lr_p)

    self.episodic_evaluator = self.dummy_episodic_evaluator

    # Action matrix scaled by 1/sqrt(3), with a leading axis of size 1.
    actions = torch.FloatTensor(consts.hotvec_matrix) / (3**(0.5))
    actions = Variable(actions, requires_grad=False).cuda()

    self.actions_matrix = actions.unsqueeze(0)
def __init__(self, load_dataset=True):
    """Build the data pipeline, losses, model, optimizers and action matrix.

    Args:
        load_dataset: when truthy, split the demonstration meta-data and
            create the train/val/test/full datasets with their samplers
            and loaders.
    """
    super(BehavioralAgent, self).__init__()

    self.actions_transform = np.array(consts.action2activation[args.game])

    self.meta, self.data = preprocess_demonstrations()

    if load_dataset:
        # NOTE(review): the data pipeline is assumed to sit entirely under
        # load_dataset -- confirm against the original layout.

        # demonstration source
        self.meta = divide_dataset(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta,
                                                 self.data)
        self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
        self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                       train=True)
        # The validation sampler must be built over the dataset that
        # val_loader reads from; it was previously built over train_dataset.
        self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                     train=False)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                      train=False)
        self.episodic_sampler = SequentialDemonstrationSampler(
            self.full_dataset)

        def batch_loader(dataset, batch_sampler):
            # All batch-sampled loaders share the same worker/pinning setup.
            return torch.utils.data.DataLoader(
                dataset, batch_sampler=batch_sampler,
                num_workers=args.cpu_workers, pin_memory=True,
                drop_last=False)

        self.train_loader = batch_loader(self.train_dataset,
                                         self.train_sampler)
        self.test_loader = batch_loader(self.test_dataset, self.test_sampler)
        self.val_loader = batch_loader(self.val_dataset, self.val_sampler)

        self.episodic_loader = torch.utils.data.DataLoader(
            self.full_dataset, sampler=self.episodic_sampler,
            batch_size=self.batch, num_workers=args.cpu_workers)

    if self.l1_loss:
        self.loss_fn_value = torch.nn.L1Loss(size_average=True)
        self.individual_loss_fn_value = self.individual_loss_fn_l1
    else:
        self.loss_fn_value = torch.nn.MSELoss(size_average=True)
        self.individual_loss_fn_value = self.individual_loss_fn_l2

    self.loss_fn_r = torch.nn.MSELoss(size_average=True)
    self.individual_loss_fn_r = self.individual_loss_fn_l2

    self.loss_fn_q = torch.nn.L1Loss(size_average=True)
    self.individual_loss_fn_q = self.individual_loss_fn_l1

    self.loss_fn_p = torch.nn.L1Loss(size_average=True)
    self.individual_loss_fn_p = self.individual_loss_fn_l1

    # alpha weighted sum
    self.alpha_v = 1  # 1 / 0.02
    self.alpha_b = 1  # 1 / 0.7
    self.alpha_r = 1  # 1 / 0.7
    self.alpha_p = 1  # 1 / 0.7
    self.alpha_q = 1

    # The deterministic flag selects the behavioral loss, the
    # learn/test/play callbacks and the network class together.
    if args.deterministic:
        self.loss_fn_beta = torch.nn.L1Loss(size_average=True)
        self.learn = self.learn_deterministic
        self.test = self.test_deterministic
        self.play = self.play_deterministic
        self.play_episode = self.play_episode_deterministic
        self.model_single = BehavioralNetDeterministic(
            self.global_action_space)
    else:
        self.loss_fn_beta = torch.nn.CrossEntropyLoss()
        self.learn = self.learn_stochastic
        self.test = self.test_stochastic
        self.play = self.play_stochastic
        self.play_episode = self.play_episode_stochastic
        self.model_single = BehavioralNet(self.global_action_space)

    # configure learning
    if self.cuda:
        self.model_single = self.model_single.cuda()
    # No DataParallel wrapper: the model is used directly in both cases.
    self.model = self.model_single

    def optimizer_with_scheduler(lr):
        # Every optimizer covers all model parameters; only the learning
        # rate differs between them.
        optimizer = BehavioralAgent.set_optimizer(self.model.parameters(),
                                                  lr)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                           self.decay)
        return optimizer, scheduler

    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    self.optimizer_v, self.scheduler_v = optimizer_with_scheduler(args.lr)
    self.optimizer_beta, self.scheduler_beta = optimizer_with_scheduler(
        args.lr_beta)
    self.optimizer_q, self.scheduler_q = optimizer_with_scheduler(args.lr_q)
    self.optimizer_r, self.scheduler_r = optimizer_with_scheduler(args.lr_r)
    self.optimizer_p, self.scheduler_p = optimizer_with_scheduler(args.lr_p)

    self.episodic_evaluator = self.dummy_episodic_evaluator

    # build the action matrix
    # The unmasked excitation map is used. A product with the per-game
    # excitation mask used to be computed here and was overwritten by the
    # very next statement; that dead store was removed.
    # NOTE(review): if masking was intended, restore mask_dup * excitation.
    excitation = torch.LongTensor(consts.excitation_map)
    actions = Variable(excitation, requires_grad=False)
    if args.cuda:
        actions = actions.cuda()

    self.actions_matrix = actions.unsqueeze(0)
    self.actions_matrix = self.actions_matrix.repeat(1, 1, 1).float()

    self.go_to_max = np.inf  # 4096 * 8 * 4

    self.reverse_excitation_index = consts.reverse_excitation_index
def __init__(self, load_dataset=True):
    """Build the data pipeline, losses, both models and their optimizers.

    Args:
        load_dataset: when truthy, split the demonstration meta-data and
            create the datasets, samplers and the train/test loaders.
    """
    super(BehavioralEmbeddedAgent, self).__init__()

    self.meta, self.data = preprocess_demonstrations()

    if load_dataset:
        # NOTE(review): the data pipeline is assumed to sit entirely under
        # load_dataset -- confirm against the original layout.

        # demonstration source
        self.meta = divide_dataset(self.meta)

        # datasets
        self.train_dataset = DemonstrationMemory("train", self.meta,
                                                 self.data)
        self.val_dataset = DemonstrationMemory("val", self.meta, self.data)
        self.test_dataset = DemonstrationMemory("test", self.meta, self.data)
        self.full_dataset = DemonstrationMemory("full", self.meta, self.data)

        self.train_sampler = DemonstrationBatchSampler(self.train_dataset,
                                                       train=True)
        # The validation sampler is built over the validation dataset; it
        # was previously built over train_dataset. No validation loader is
        # created in this constructor.
        self.val_sampler = DemonstrationBatchSampler(self.val_dataset,
                                                     train=False)
        self.test_sampler = DemonstrationBatchSampler(self.test_dataset,
                                                      train=False)
        self.episodic_sampler = SequentialDemonstrationSampler(
            self.full_dataset)

        self.train_loader = torch.utils.data.DataLoader(
            self.train_dataset, batch_sampler=self.train_sampler,
            num_workers=args.cpu_workers, pin_memory=True, drop_last=False)
        self.test_loader = torch.utils.data.DataLoader(
            self.test_dataset, batch_sampler=self.test_sampler,
            num_workers=args.cpu_workers, pin_memory=True, drop_last=False)

    self.loss_v_beta = torch.nn.KLDivLoss()
    self.loss_q_beta = torch.nn.KLDivLoss()

    self.loss_v_pi = torch.nn.KLDivLoss()
    self.loss_q_pi = torch.nn.KLDivLoss()

    # Three weight vectors derived from the histogram, each clamped to
    # [0, 10] and used as class weights for one behavioral loss.
    self.histogram = torch.from_numpy(self.meta['histogram']).float()

    w_f, w_v, w_h = calc_hist_weights(self.histogram)

    w_f = torch.clamp(w_f, 0, 10).cuda()
    w_v = torch.clamp(w_v, 0, 10).cuda()
    w_h = torch.clamp(w_h, 0, 10).cuda()

    self.loss_beta_f = torch.nn.CrossEntropyLoss(size_average=True,
                                                 weight=w_f)
    self.loss_beta_v = torch.nn.CrossEntropyLoss(size_average=True,
                                                 weight=w_v)
    self.loss_beta_h = torch.nn.CrossEntropyLoss(size_average=True,
                                                 weight=w_h)

    self.loss_pi_f = torch.nn.CrossEntropyLoss(size_average=False)
    self.loss_pi_v = torch.nn.CrossEntropyLoss(size_average=False)
    self.loss_pi_h = torch.nn.CrossEntropyLoss(size_average=False)

    self.behavioral_model = BehavioralDistEmbedding()
    self.behavioral_model.cuda()

    # actor critic setting
    self.actor_critic_model = ActorCritic()
    self.actor_critic_model.cuda()

    self.actor_critic_target = ActorCritic()
    self.actor_critic_target.cuda()

    # configure learning
    def params_matching(module, tag):
        # Parameters of the module whose registered name contains the tag.
        return [param for name, param in module.named_parameters()
                if tag in name]

    cnn_params = params_matching(self.behavioral_model, "cnn")
    emb_params = params_matching(self.behavioral_model, "emb")

    v_beta_params = params_matching(self.behavioral_model, "fc_v")
    a_beta_params = params_matching(self.behavioral_model, "fc_adv")

    beta_f_params = params_matching(self.behavioral_model, "fc_beta_f")
    beta_v_params = params_matching(self.behavioral_model, "fc_beta_v")
    beta_h_params = params_matching(self.behavioral_model, "fc_beta_h")

    v_pi_params = params_matching(self.actor_critic_model, "critic_v")
    a_pi_params = params_matching(self.actor_critic_model, "critic_adv")

    pi_f_params = params_matching(self.actor_critic_model, "fc_actor_f")
    pi_v_params = params_matching(self.actor_critic_model, "fc_actor_v")
    pi_h_params = params_matching(self.actor_critic_model, "fc_actor_h")

    def optimizer_with_scheduler(params):
        # All optimizers here share the same learning rate of 0.0008.
        optimizer = BehavioralEmbeddedAgent.set_optimizer(params, 0.0008)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,
                                                           self.decay)
        return optimizer, scheduler

    # IT IS IMPORTANT TO ASSIGN MODEL TO CUDA/PARALLEL BEFORE DEFINING
    # OPTIMIZER
    self.optimizer_critic_v, self.scheduler_critic_v = \
        optimizer_with_scheduler(v_pi_params)
    self.optimizer_critic_q, self.scheduler_critic_q = \
        optimizer_with_scheduler(v_pi_params + a_pi_params)

    self.optimizer_v_beta, self.scheduler_v_beta = \
        optimizer_with_scheduler(cnn_params + emb_params + v_beta_params)
    self.optimizer_q_beta, self.scheduler_q_beta = \
        optimizer_with_scheduler(
            cnn_params + emb_params + v_beta_params + a_beta_params)

    self.optimizer_beta_f, self.scheduler_beta_f = \
        optimizer_with_scheduler(cnn_params + emb_params + beta_f_params)
    self.optimizer_beta_v, self.scheduler_beta_v = \
        optimizer_with_scheduler(cnn_params + emb_params + beta_v_params)
    self.optimizer_beta_h, self.scheduler_beta_h = \
        optimizer_with_scheduler(cnn_params + emb_params + beta_h_params)

    self.optimizer_pi_f, self.scheduler_pi_f = optimizer_with_scheduler(
        pi_f_params)
    self.optimizer_pi_v, self.scheduler_pi_v = optimizer_with_scheduler(
        pi_v_params)
    self.optimizer_pi_h, self.scheduler_pi_h = optimizer_with_scheduler(
        pi_h_params)

    actions = torch.LongTensor(consts.hotvec_matrix).cuda()
    self.actions_matrix = actions.unsqueeze(0)

    # Both bin sets are divided by the average score; the last edge is
    # dropped for the numpy copies but kept for the torch copies.
    self.q_bins = consts.q_bins[args.game][:-1] / self.meta['avg_score']
    self.v_bins = consts.v_bins[args.game][:-1] / self.meta['avg_score']

    self.q_bins_torch = Variable(torch.from_numpy(
        consts.q_bins[args.game] / self.meta['avg_score']),
        requires_grad=False).cuda()
    self.v_bins_torch = Variable(torch.from_numpy(
        consts.v_bins[args.game] / self.meta['avg_score']),
        requires_grad=False).cuda()

    self.batch_range = np.arange(self.batch)
    self.zero = Variable(torch.zeros(1))