def __init__(self, episode_buffer, replay_buffer, action_space=3):
    self.lr = PARAM.LEARNING_RATE
    self.episode_buffer = episode_buffer
    self.replay_buffer = replay_buffer
    self.N = PARAM.N
    self.gamma = PARAM.gamma
    self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
    self.aux_batch_size = PARAM.AUX_TASK_BATCH_SIZE
    self.vfr_weight = PARAM.VFR_LOSS_WEIGHT
    self.rp_weight = PARAM.RP_LOSS_WEIGHT
    self.pc_weight = PARAM.PC_LOSS_WEIGHT

    # A2C network
    self.A = AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=action_space, seq_len=self.seq_len)

    # GPU availability
    self.gpu = torch.cuda.is_available()
    if self.gpu:
        print("Using GPU")
        self.A = self.A.cuda()
    else:
        print("Using CPU")

    # Loss functions and optimizer
    self.optimizer = optim.Adam(self.A.parameters(), lr=self.lr, weight_decay=1e-6)
    self.vfr_criterion = nn.MSELoss()          # Value Function Replay loss
    self.rp_criterion = nn.CrossEntropyLoss()  # Reward Prediction loss
    self.pc_criterion = nn.MSELoss()           # Pixel Control loss
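# The VFR/RP/PC weights above are typically folded into a single objective before
# backprop. A minimal sketch, assuming the per-task losses are computed elsewhere
# in the trainer (method and argument names here are illustrative, not from this repo):
def combined_loss(self, a2c_loss, vfr_loss, rp_loss, pc_loss):
    # UNREAL-style weighted sum of the base A2C loss and the auxiliary-task losses
    return (a2c_loss
            + self.vfr_weight * vfr_loss
            + self.rp_weight * rp_loss
            + self.pc_weight * pc_loss)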
def __init__(self, episode_buffer, replay_buffer, action_space=3):
    self.lr = PARAM.LEARNING_RATE
    self.episode_buffer = episode_buffer
    self.replay_buffer = replay_buffer
    self.N = PARAM.N
    self.gamma = PARAM.gamma
    self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
    self.aux_batch_size = PARAM.AUX_TASK_BATCH_SIZE
    self.vfr_weight = PARAM.VFR_LOSS_WEIGHT
    self.rp_weight = PARAM.RP_LOSS_WEIGHT
    self.pc_weight = PARAM.PC_LOSS_WEIGHT

    # PPO hyperparameters
    self.ppo_epochs = 10       # PARAM.PPO_EPOCHS
    self.num_mini_batch = 12   # PARAM.PPO_NUM_MINI_BATCH
    self.clip_param = 0.2
    # self.max_grad_norm = PARAM.MAX_GRAD_NORM
    # self.use_clipped_value_loss = PARAM.USE_CLIPPED_VALUE_LOSS

    # A2C network
    self.A = AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=action_space, seq_len=self.seq_len)

    # GPU availability
    self.gpu = torch.cuda.is_available()
    if self.gpu:
        print("Using GPU")
        self.A = self.A.cuda()
    else:
        print("Using CPU")

    # Loss functions and optimizer
    self.optimizer = optim.Adam(self.A.parameters(), lr=self.lr, weight_decay=1e-6)
    self.vfr_criterion = nn.MSELoss()          # Value Function Replay loss
    self.rp_criterion = nn.CrossEntropyLoss()  # Reward Prediction loss
    self.pc_criterion = nn.MSELoss()           # Pixel Control loss
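# clip_param above is the epsilon of PPO's clipped surrogate objective. A minimal,
# self-contained sketch of that objective (an assumption about how this trainer's
# PPO update uses it; function and argument names are illustrative):
import torch

def ppo_policy_loss(new_log_probs, old_log_probs, advantages, clip_param=0.2):
    # Probability ratio between the updated policy and the behavior policy
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Unclipped and clipped surrogate terms; take the elementwise minimum
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantages
    return -torch.min(surr1, surr2).mean()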
def __init__(self, ReplayBuffer, action_space=3, network=None):
    self.lr = PARAM.LEARNING_RATE
    self.N = PARAM.N
    self.gamma = PARAM.gamma
    self.seq_len = PARAM.A2C_SEQUENCE_LENGTH
    self.aux_batch_size = PARAM.AUX_TASK_BATCH_SIZE
    self.vfr_weight = PARAM.VFR_LOSS_WEIGHT
    self.rp_weight = PARAM.RP_LOSS_WEIGHT
    self.pc_weight = PARAM.PC_LOSS_WEIGHT
    self.gpu = torch.cuda.is_available()

    if PARAM.ENSEMBLE < 1:
        # Single A2C network
        self.A = AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=action_space, seq_len=self.seq_len)

        # GPU availability
        if self.gpu:
            print("Using GPU")
            self.A = self.A.cuda()
        else:
            print("Using CPU")

        self.replay_buffer = ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE)

        # Optimizer
        self.optimizer = optim.Adam(self.A.parameters(), lr=self.lr, weight_decay=1e-6)
    else:
        # Ensemble of networks; pull the active network, buffer and optimizer from the ensemble
        self.Ensemble = Ensemble(PARAM.ENSEMBLE, action_space, self.seq_len, ReplayBuffer, network)
        self.source_context()

    # Loss functions
    self.vfr_criterion = nn.MSELoss()          # Value Function Replay loss
    self.rp_criterion = nn.CrossEntropyLoss()  # Reward Prediction loss
    self.pc_criterion = nn.MSELoss()           # Pixel Control loss
def main():
    man = Manager()

    # Share the ensemble networks across worker processes via a Manager list
    if cuda.is_available():
        list_of_networks = man.list([AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=3, seq_len=PARAM.A2C_SEQUENCE_LENGTH).cuda()
                                     for i in range(PARAM.ENSEMBLE)])
    else:
        list_of_networks = man.list([AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=3, seq_len=PARAM.A2C_SEQUENCE_LENGTH)
                                     for i in range(PARAM.ENSEMBLE)])

    args = parse_arguments()

    # One training process per agent, each receiving the shared network list
    p = Pool(PARAM.AGENTS)
    p.map(train, [list_of_networks] * PARAM.AGENTS, chunksize=1)
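# Note: with multiprocessing, the entry point is normally guarded so that worker
# processes do not re-execute module-level code on import (a standard Python
# idiom, shown here as a reminder rather than as part of the original source):
if __name__ == '__main__':
    main()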
def __init__(self, size, action_space, seq_len, ReplayBuffer, network):
    self.list_of_networks = network
    if network is None:
        self.list_of_networks = [AuxNetwork(state_size=PARAM.STATE_SIZE, action_space=3, seq_len=PARAM.A2C_SEQUENCE_LENGTH)
                                 for i in range(PARAM.ENSEMBLE)]

    # GPU availability
    self.gpu = torch.cuda.is_available()
    if self.gpu:
        print("Using GPU")
        self.list_of_networks = [network.cuda() for network in self.list_of_networks]
    else:
        print("Using CPU")

    # One optimizer and one replay buffer per ensemble member
    self.list_of_optimizers = [optim.Adam(network.parameters(), lr=PARAM.LEARNING_RATE, weight_decay=1e-6)
                               for network in self.list_of_networks]
    self.list_of_replay_buffers = [ReplayBuffer(PARAM.REPLAY_MEMORY_SIZE) for network in self.list_of_networks]
    self.list_of_action_repeats = PARAM.ACTION_REPEAT

    # Start with the last network as the active member
    self.current = len(self.list_of_networks) - 1
    self.update_context()

    if PARAM.USE_ALTERNATE_SWITCHING_POLICY:
        self.analyze_rewards = self.analyze_rewards_1
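# A minimal sketch of how the active-member index might be rotated (hypothetical
# helper, assuming update_context() refreshes the active network, optimizer and
# replay buffer; this is not a method defined in the original source):
def switch_network(self):
    self.current = (self.current + 1) % len(self.list_of_networks)
    self.update_context()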