class Sampler():
    def __init__(self, device, actionsize):
        self.samplenet = DQN(actionsize).to(device)
        self.targetnet = DQN(actionsize).to(device)
        self.opt = torch.optim.Adam(self.samplenet.parameters(),
                                    lr=0.00001, betas=(0.0, 0.9))
        self.device = device
        self.memory = ReplayMemory(1000, device=device)
        self.BATCH_SIZE = 10
        self.GAMMA = 0.99
        self.count = 0

    def select_action(self, model):
        self.samplenet.eval()
        action = self.samplenet(model.conv2.weight.data.view(-1, 5, 5).unsqueeze(0))
        return torch.max(action, 1)[1]

    def step(self, state, action, next_state, reward, done):
        self.memory.push(state, action, next_state, reward, done)
        # Don't bother optimizing until there are enough transitions in memory.
        if len(self.memory) >= self.BATCH_SIZE:
            self.optimize()

    def optimize(self):
        self.samplenet.train()
        self.targetnet.eval()
        s1, actions, r1, s2, d = self.memory.sample(self.BATCH_SIZE)
        # Get old and new Q-values for the Bellman equation.
        qvals = self.samplenet(s1)
        state_action_values = qvals.gather(1, actions[:, 0].unsqueeze(1))
        with torch.no_grad():
            qvals_t = self.targetnet(s2)
            q1_t = qvals_t.max(1)[0].unsqueeze(1)
            expected_state_action_values = (q1_t * self.GAMMA) * (1 - d) + r1
        # The loss is the L2 error of the Bellman equation.
        loss = torch.nn.MSELoss()(state_action_values, expected_state_action_values)
        self.opt.zero_grad()
        loss.backward()
        self.opt.step()
        self.count += 1  # count optimizer steps so the target net is refreshed periodically
        if self.count % 20 == 0:
            self.targetnet.load_state_dict(self.samplenet.state_dict())
        return loss.item()
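# Illustration (not part of the original code): the target used in Sampler.optimize above is the
# standard one-step Bellman backup against a frozen target network. The self-contained sketch below
# reproduces that computation on toy tensors; q_online and q_target stand in for samplenet(s1) and
# targetnet(s2) outputs and are random placeholders.
import torch

batch, n_actions, gamma = 4, 3, 0.99
q_online = torch.randn(batch, n_actions)            # Q(s, .) from the online net
q_target = torch.randn(batch, n_actions)            # Q(s', .) from the target net
actions = torch.randint(n_actions, (batch, 1))      # actions actually taken
rewards = torch.randn(batch, 1)
dones = torch.zeros(batch, 1)                       # 1.0 where the episode ended

state_action_values = q_online.gather(1, actions)            # Q(s, a)
best_next = q_target.max(1)[0].unsqueeze(1)                  # max_a' Q_target(s', a')
expected = rewards + gamma * best_next * (1 - dones)         # Bellman target
loss = torch.nn.MSELoss()(state_action_values, expected)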
def __init__(self, session=None, arguments=None):
    self.sess = session
    self.args = arguments

    # Initialize Gym environment.
    self.environment = gym.make(self.args.env)

    if self.args.env == 'MountainCarContinuous-v0':
        input_dimensions = 2
        output_dimensions = 1
    elif self.args.env == 'InvertedPendulum-v2':
        input_dimensions = 4
        output_dimensions = 1
    elif self.args.env == 'FetchReach-v0':
        input_dimensions = 16
        output_dimensions = 4
    elif self.args.env == 'FetchPush-v0':
        input_dimensions = 31
        output_dimensions = 4

    # Initialize a policy network.
    self.ACModel = ActorCriticModel(input_dimensions, output_dimensions,
                                    number_layers=4, hidden_units=40,
                                    sess=session, to_train=self.args.train,
                                    env=self.args.env)

    # Create the actual network.
    if self.args.weights:
        self.ACModel.create_policy_network(session,
                                           pretrained_weights=self.args.weights,
                                           to_train=self.args.train)
    else:
        self.ACModel.create_policy_network(session, to_train=self.args.train)

    # Initialize a replay memory.
    self.memory = ReplayMemory()

    # Create a trainer instance.
    self.trainer = Trainer(sess=session, policy=self.ACModel,
                           environment=self.environment,
                           memory=self.memory, args=self.args)
def get_player(pred_model: keras.Model, strat_model: keras.Model) -> RnnPlayer:
    small_replay_memory = ReplayMemory(1)
    small_rnn_replay_memory = RnnReplayMemory(1)
    prediction_network = PredictionNetwork(pred_model, small_replay_memory, 1, False)
    strategy_network = RnnStrategyNetwork(strat_model, small_rnn_replay_memory, 1, False)
    return RnnPlayer(prediction_network, strategy_network, 0.0, 0.0)
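# Hypothetical usage of get_player (the file paths are placeholders, not taken from the original
# code): both networks get a size-1 replay memory and cannot train, and the 0.0 exploration rates
# make the returned RnnPlayer act greedily, i.e. a frozen evaluation player.
#
# frozen_player = get_player(keras.models.load_model('prediction.h5'),
#                            keras.models.load_model('strategy.h5'))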
def __init__(self, session=None, arguments=None):
    self.sess = session
    self.args = arguments

    # Initialize Gym environment.
    self.environment = gym.make(self.args.env)

    if self.args.env == 'FetchReach-v0':
        input_dimensions = 16
    elif self.args.env == 'FetchPush-v0':
        input_dimensions = 31
    output_dimensions = 4

    # Initialize a policy network.
    # self.ACModel = ActorCriticModel(input_dimensions, output_dimensions, sess=session, to_train=self.args.train)
    self.PolicyModel = DAggerPolicy(input_dimensions, output_dimensions,
                                    name_scope='PolicyModel', sess=session,
                                    to_train=self.args.train)

    # Create the actual network.
    if self.args.weights:
        self.PolicyModel.create_policy_network(session,
                                               pretrained_weights=self.args.weights,
                                               to_train=self.args.train)
    else:
        self.PolicyModel.create_policy_network(session, to_train=self.args.train)

    # Initialize a replay memory.
    self.memory = ReplayMemory()

    # Create a trainer instance.
    self.trainer = Trainer(sess=session, policy=self.PolicyModel,
                           environment=self.environment,
                           memory=self.memory, args=self.args)
class Generator():
    def __init__(self, device, data):
        self.data = data
        self.actor = Actor().to(device)
        self.critic = Critic().to(device)
        #self.ctarget = Critic().to(device)
        self.actor_opt = torch.optim.Adam(self.actor.parameters(),
                                          lr=0.0001, betas=(0.0, 0.9))
        self.critic_opt = torch.optim.Adam(self.critic.parameters(),
                                           lr=0.001, betas=(0.0, 0.9))

        def init_weights(m):
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                torch.nn.init.xavier_uniform_(m.weight.data)

        self.actor.apply(init_weights)
        self.critic.apply(init_weights)
        #self.ctarget.apply(init_weights)
        self.device = device
        self.memory = ReplayMemory(1000, device=device)
        self.batch_size = 5
        self.GAMMA = 0.99
        self.count = 0

    def select_action(self, imgs):
        with torch.no_grad():
            self.actor.eval()
            action = self.actor(imgs)
        return action

    def step(self, state, action, next_state, reward, done):
        self.memory.push(state, action, next_state, reward, done)
        if len(self.memory) >= self.batch_size:
            self.optimize()

    def optimize(self):
        self.actor.train()
        self.critic.train()
        #self.ctarget.eval()
        s1, a, r, s2, d = self.memory.sample(self.batch_size)

        # Train the critic.
        for reward, action in zip(r, a):
            qval = self.critic(action)
            avgQ = qval.mean().unsqueeze(0)
            loss = torch.nn.L1Loss()(avgQ, reward)
            self.critic_opt.zero_grad()
            loss.backward()
            self.critic_opt.step()

        # Train the actor.
        img, target = self.data[random.randint(0, len(self.data) - 1)]
        batch = self.actor(img)
        score = self.critic(batch)
        actor_loss = -score.mean()
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        #if self.count % 5 == 0:
        #    self.ctarget.load_state_dict(self.critic.state_dict())
        #self.count += 1

    def save(self):
        torch.save(self.actor.state_dict(), os.path.join('model', 'actor.pth'))
        torch.save(self.critic.state_dict(), os.path.join('model', 'critic.pth'))
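# Illustration (not part of the original code): the actor update in Generator.optimize follows the
# usual "actor maximizes the critic's score" pattern, i.e. it minimizes -critic(actor(input)).mean().
# The self-contained sketch below shows that coupling with throwaway linear modules standing in for
# the repository's Actor and Critic.
import torch
import torch.nn as nn

toy_actor = nn.Linear(8, 4)      # placeholder for Actor(): input -> generated sample
toy_critic = nn.Linear(4, 1)     # placeholder for Critic(): sample -> score
actor_opt = torch.optim.Adam(toy_actor.parameters(), lr=1e-4, betas=(0.0, 0.9))

x = torch.randn(16, 8)                   # stand-in for an image batch from self.data
score = toy_critic(toy_actor(x))         # critic's judgement of the generated batch
actor_loss = -score.mean()               # higher score -> lower loss
actor_opt.zero_grad()
actor_loss.backward()
actor_opt.step()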
def main():
    # create replay memories
    pred_memories = [ReplayMemory(prediction_replay_memory_size) for _ in range(1)]
    strat_memories = [RnnReplayMemory(strategy_replay_memory_size) for _ in range(1)]

    # create Networks
    pred_networks = [
        PredictionNetwork(prediction_resnet(), pred_memories[0], prediction_net_batch_size, True),
    ]
    print(pred_networks[0]._neural_network.summary())
    strat_networks = [
        RnnStrategyNetwork(strategy_deep_lstm_resnet(), strat_memories[0], strategy_net_batch_size, True),
    ]
    print(strat_networks[0]._neural_network.summary())

    # make pairs of the networks
    networks = list(sum(zip(pred_networks, strat_networks), ()))

    # give each network a name
    pred_network_names = [
        'normal_prediction'
    ]
    strat_network_names = [
        'normal_strategy'
    ]

    # make the same pairs as above
    network_names = list(sum(zip(pred_network_names, strat_network_names), ()))

    # create players
    players = [
        [RnnPlayer(pred_networks[0], strat_networks[0], prediction_exploration_rate, strategy_exploration_rate)
         for _ in range(4)],
    ]
    # flatten players
    players = sum(players, [])

    # create one PlayerInterlayer for each player
    players = [
        [RnnPlayerInterlayer(player, normal_pred_y_func, normal_strat_y_func) for player in players]
    ]
    players = sum(players, [])

    # create one Sitting
    sitting = Sitting(debugging)

    last_stop = datetime.datetime.now()
    r = random.Random()
    with open('stats_dev.txt', 'w') as f:
        f.write("// interval to print stats: " + str(interval_to_print_stats) + "\n")
        total_diff = 0
        total_losses = [0.0 for _ in range(len(networks))]
        for i in range(start_offset, total_rounds, 10):
            sitting.set_players(r.sample(players, 4))
            for _ in range(10):
                total_diff += sitting.play_full_round()
            i += 9
            if only_train_in_turn:
                index_to_train = i // turn_size % len(networks)
                total_losses[index_to_train] += networks[index_to_train].train()
            else:
                for net_i, network in enumerate(networks):
                    total_losses[net_i] += network.train()
            if (i + 1) % interval_to_print_stats == 0:
                print(str(i + 1), "rounds have been played")
                avg = total_diff / 4 / interval_to_print_stats
                print("Average difference of one player:\t", avg)
                losses_string = ', '.join([str(l) for l in np.array(total_losses) / interval_to_print_stats])
                print("The losses are:\t", losses_string)
                print("It took:", datetime.datetime.now() - last_stop)
                last_stop = datetime.datetime.now()
                print('')
                f.write(str(i + 1) + "\n")
                f.write(str(avg) + "\n")
                f.write(losses_string + "\n")
                total_diff = 0
                total_losses = [0.0 for _ in range(len(networks))]
            if (i + 1) % rounds_until_save == 0:
                for keras_net, net_name in zip(networks, network_names):
                    if 'random' in net_name:
                        continue
                    elif 'pred' in net_name:
                        full_name = prediction_save_path
                    elif 'strat' in net_name:
                        full_name = strategy_save_path
                    else:
                        assert 0, net_name
                    full_name += net_name + '_' + str(i + 1) + '.h5'
                    keras_net.save_network(full_name)
            if i + 1 == round_when_adding_players:
                print('adding players')
                # add 2 more normal players
                nps = [RnnPlayer(networks[-2], networks[-1], prediction_exploration_rate, strategy_exploration_rate)
                       for _ in range(2)]
                inps = [RnnPlayerInterlayer(nps[i], normal_pred_y_func, normal_strat_y_func) for i in range(2)]
                players += inps
                # add 2 static versions of the current normal player
                pred_mem = ReplayMemory(1)
                strat_mem = RnnReplayMemory(1)
                pred_net = load_model(prediction_save_path + 'normal_prediction_' + str(i + 1) + '.h5')
                strat_net = load_model(strategy_save_path + 'normal_strategy_' + str(i + 1) + '.h5')
                p_net = PredictionNetwork(pred_net, pred_mem, 1, False)
                s_net = RnnStrategyNetwork(strat_net, strat_mem, 1, False)
                ps = [RnnPlayer(p_net, s_net, 0.02, 0.02) for _ in range(2)]
                ips = [RnnPlayerInterlayer(ps[i], normal_pred_y_func, normal_strat_y_func) for i in range(2)]
                players += ips
class Td3Agent:
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.99,
                 actor_lr=5e-3,
                 critic_lr=5e-3,
                 batch_size=100,
                 memory_size=50000,
                 tau=1e-3,
                 weight_decay=1e-2,
                 sigma=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 writer=None,
                 is_image=False):
        super(Td3Agent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 * (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.actor = ActorNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.actor_target = ActorNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay)
        self.memory = ReplayMemory(observation_space, action_space, device,
                                   num_state=self.num_state, memory_size=memory_size, is_image=is_image)
        self.criterion = nn.SmoothL1Loss()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

    def normalize_state(self, state):
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def update(self):
        self.update_step += 1
        with torch.no_grad():
            batch, indices, probability_distribution = self.memory.random_sample()
            # Fetch the state and action values for each sample.
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            next_state_batch = batch['next_obs'].clone().to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            noise = (torch.randn_like(action_batch) * self.sigma).clamp(-self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_state_batch) + noise).clamp(
                -self.action_mean - self.action_halfwidth,
                self.action_mean + self.action_halfwidth)
            target_q1, target_q2 = self.critic_target(next_state_batch, next_action)
            target_q = torch.min(target_q1, target_q2)
            target_q_values = reward_batch + self.gamma * target_q * (1 - terminate_batch)
        self.actor.train()
        self.critic.train()
        current_q1, current_q2 = self.critic(state_batch, action_batch)
        # Compute the critic loss.
        critic_loss = self.criterion(current_q1, target_q_values) + self.criterion(current_q2, target_q_values)
        # Reset the gradients.
        self.critic_optimizer.zero_grad()
        # Backpropagate.
        critic_loss.backward()
        # Apply the gradient update.
        self.critic_optimizer.step()
        if self.writer and self.update_step % 1000 == 0:
            self.writer.add_scalar("loss/critic", critic_loss.item(), self.update_step / 1000)
            #print("loss/critic", critic_loss.item())
        if self.update_step % self.policy_freq == 0:
            actor_loss = -self.critic.q1_forward(state_batch, self.actor(state_batch)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            if self.writer and self.update_step % 1000 == 0:
                self.writer.add_scalar("loss/actor", actor_loss.item(), self.update_step / 1000)
                #print("loss/actor", actor_loss.item())
            self.soft_update(self.actor_target, self.actor)
            self.soft_update(self.critic_target, self.critic)
        self.actor.eval()
        self.critic.eval()

    # Select the action with the highest Q-value.
    def get_action(self, state, is_noise=True):
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(-1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action = self.actor(state_tensor).view(self.num_action)
            noise = np.random.normal(loc=0.0, scale=self.sigma)
            action_with_noise = np.clip(action.to('cpu').detach().numpy().copy() + noise, -1, 1)
            action = action.to('cpu').detach().numpy().copy()
        if not is_noise:
            return action
        return action_with_noise
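# Illustration (not part of the original code): the core of Td3Agent.update is the TD3 target
# trick, i.e. clipped Gaussian noise on the target policy's action plus the minimum of the twin
# target critics. The sketch below is self-contained; the two lambdas stand in for actor_target
# and critic_target and are purely illustrative, with the action range assumed to be [-1, 1].
import torch

sigma, noise_clip, gamma = 0.2, 0.5, 0.99
actor_target = lambda s: torch.tanh(s.sum(dim=1, keepdim=True))          # fake target policy
critic_target = lambda s, a: (s.sum(dim=1, keepdim=True) - a,            # fake twin critics
                              s.mean(dim=1, keepdim=True) + a)

next_state = torch.randn(8, 3)
rewards = torch.randn(8, 1)
dones = torch.zeros(8, 1)

noise = (torch.randn(8, 1) * sigma).clamp(-noise_clip, noise_clip)       # target policy smoothing
next_action = (actor_target(next_state) + noise).clamp(-1.0, 1.0)
q1, q2 = critic_target(next_state, next_action)
target_q = torch.min(q1, q2)                                             # clipped double-Q estimate
target_q_values = rewards + gamma * target_q * (1 - dones)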
class PpgAgent:
    def __init__(
            self,
            observation_space,
            action_space,
            device,
            gamma=0.995,
            actor_lr=5e-4,
            critic_lr=5e-4,
            batch_size=128,
            memory_size=50000,
            tau=5e-3,
            weight_decay=1e-2,
            sigma=0.2,
            noise_clip=0.5,
            alpha=0.2,
            alpha_lr=3e-4,
            rollout_length=2048,
            lambda_=0.95,
            beta_clone=1.0,
            coef_ent=0.01,
            num_updates=32,
            policy_epoch=1,
            value_epoch=1,
            aux_num_updates=6,
            aux_epoch_batch=64,
            max_grad_norm=0.5,
            aux_critic_loss_coef=1.0,
            clip_eps=0.2,
            writer=None,
            is_image=False,
            clip_aux_critic_loss=None,
            clip_aux_multinet_critic_loss=None,
            multipleet_upadte_clip_grad_norm=None,
            summary_interval=1,
            debug_no_aux_phase=False):
        super(PpgAgent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 * (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.multipleNet = MultipleNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.multipleNet_target = MultipleNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.multipleNet_target.load_state_dict(self.multipleNet.state_dict())
        self.multipleNet_optimizer = optim.Adam(self.multipleNet.parameters(), lr=actor_lr)
        self.critic = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay)
        self.alpha = alpha
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)
        self.memory = ReplayMemory(observation_space, action_space, device,
                                   num_state=self.num_state, memory_size=memory_size, is_image=is_image)
        self.criterion = nn.MSELoss()
        self.device = device
        self.tau = tau
        self.writer = writer
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.rollout_length = rollout_length
        self.lambda_ = lambda_
        self.coef_ent = coef_ent
        self.aux_critic_loss_coef = aux_critic_loss_coef
        self.max_grad_norm = max_grad_norm
        self.aux_num_updates = aux_num_updates
        self.clip_eps = clip_eps
        self.beta_clone = beta_clone
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.num_updates = num_updates
        self.aux_epoch_batch = aux_epoch_batch
        self.clip_aux_critic_loss = clip_aux_critic_loss
        self.clip_aux_multinet_critic_loss = clip_aux_multinet_critic_loss
        self.multipleet_upadte_clip_grad_norm = multipleet_upadte_clip_grad_norm
        self.summary_interval = summary_interval
        self.debug_no_aux_phase = debug_no_aux_phase
        self.update_step = 0

    def normalize_state(self, state):
        """Return the normalized state."""
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        """Polyak update of the target network."""
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def is_update(self, steps):
        """Update at rollout-length intervals."""
        return steps % self.rollout_length == 0

    def update(self, state=None):
        """Training process. Following the original paper, training proceeds as:

        initialize replay buffer B
        1. perform rollout N_{pi} times                                   \\ Policy phase
        2. update multipleNet, loss = L^{clip} + behavior cloning loss
           update critic, loss = L^{value} (using GAE)
        3. update multipleNet, loss = L^{joint}                           \\ Auxiliary phase
           update critic, loss = L^{value} (using GAE)
        4. reset B
        """
        if not self.is_update(self.memory.index):
            return
        self.update_step += 1
        # Sample from the replay buffer.
        with torch.no_grad():
            batch = self.memory.sample(state)
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            log_pis_batch = batch['log_pis'].to(self.device)
            values = self.critic(state_batch)
            # Calculate the value target (\hat{V}_t^{targ}) for each state.
            targets, advantages = util.calculate_advantage(values, reward_batch, terminate_batch,
                                                           self.gamma, self.lambda_)

        # 2. Policy phase: update multipleNet and critic.
        # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, lines 6-7
        loss_critic = 0
        for i in range(self.value_epoch):
            indices = np.arange(self.rollout_length)
            np.random.shuffle(indices)
            for start in range(0, self.rollout_length, self.batch_size):
                idxes = indices[start:start + self.batch_size]
                loss_critic += self.update_critic(state_batch[idxes], targets[idxes])
        n_train_iteration = self.value_epoch * (self.rollout_length // self.batch_size)
        loss_critic = loss_critic / n_train_iteration
        self.writer.add_scalar('/critic/loss/policy_phase', loss_critic, self.update_step)

        # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, lines 8-9
        loss_actor, l_clip, bc_loss = 0, 0, 0
        for i in range(self.policy_epoch):
            indices = np.arange(self.rollout_length)
            np.random.shuffle(indices)
            for start in range(0, self.rollout_length, self.batch_size):
                idxes = indices[start:start + self.batch_size]
                batch_loss_actor, batch_l_clip, batch_bc_loss = self.update_MultipleNet(
                    state_batch[idxes], action_batch[idxes], log_pis_batch[idxes], advantages[idxes])
                loss_actor += batch_loss_actor
                l_clip += batch_l_clip
                bc_loss += batch_bc_loss
        # Average the losses and write them to TensorBoard.
        n_train_iteration = self.policy_epoch * (self.rollout_length // self.batch_size)
        loss_actor = loss_actor / n_train_iteration
        l_clip = l_clip / n_train_iteration
        bc_loss = bc_loss / n_train_iteration
        self.writer.add_scalar('/multiplenet/policy_phase/actor', loss_actor, self.update_step)
        self.writer.add_scalar('/multiplenet/policy_phase/l_clip', l_clip, self.update_step)
        self.writer.add_scalar('/multiplenet/policy_phase/bc_loss', bc_loss, self.update_step)

        with torch.no_grad():
            log_pis_old = self.multipleNet.evaluate_log_pi(state_batch[:-1], action_batch)

        # 3. Auxiliary phase: update multipleNet and critic.
        # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, lines 12-14
        # If self.debug_no_aux_phase is True, skip this phase, which should make this code equivalent to PPO.
        loss_critic_multi, bc_loss, loss_joint, loss_critic_aux = 0, 0, 0, 0
        if (self.update_step % self.num_updates == 0) and (not self.debug_no_aux_phase):
            for _ in range(self.aux_num_updates):
                indices = np.arange(self.rollout_length)
                np.random.shuffle(indices)
                for start in range(0, self.rollout_length, self.batch_size):
                    idxes = indices[start:start + self.batch_size]
                    batch_loss_critic_multi, batch_bc_loss, batch_loss_joint = self.update_actor_Auxiliary(
                        state_batch[idxes], action_batch[idxes], log_pis_old[idxes],
                        targets[idxes], advantages[idxes])
                    batch_loss_critic_aux = self.update_critic_Auxiliary(state_batch[idxes], targets[idxes])
                    loss_critic_multi += batch_loss_critic_multi
                    bc_loss += batch_bc_loss
                    loss_joint += batch_loss_joint
                    loss_critic_aux += batch_loss_critic_aux
            # 4. Reset the replay buffer.
            # https://arxiv.org/pdf/2009.04416.pdf algorithm 1, line 2
            self.memory.reset()
            # Average the losses and write them to TensorBoard.
            n_train_iteration = self.aux_num_updates * (self.rollout_length // self.batch_size)
            loss_critic_multi = loss_critic_multi / n_train_iteration
            bc_loss = bc_loss / n_train_iteration
            loss_joint = loss_joint / n_train_iteration
            loss_critic_aux = loss_critic_aux / n_train_iteration
            self.writer.add_scalar('/multiplenet/loss/auxiliary_phase/critic', loss_critic_multi, self.update_step)
            self.writer.add_scalar('/multiplenet/loss/auxiliary_phase/bc_loss', bc_loss, self.update_step)
            self.writer.add_scalar('/multiplenet/loss/auxiliary_phase/loss_joint', loss_joint, self.update_step)
            self.writer.add_scalar('/critic/loss/auxiliary_phase/critic', loss_critic_aux, self.update_step)
        self.multipleNet.eval()
        self.critic.eval()

    def update_actor_Auxiliary(self, states, actions, log_pis_old, targets, advantages):
        """loss = L^{joint}
        L^{joint} = L^{aux} + beta_clone * KL(pi_old, pi_current)
        In the original paper, L^{aux} = mse(v_pi(s_t), v_targ), the task for V_{theta_pi}.
        """
        loss_critic = (self.multipleNet.q_forward(states) - targets).pow_(2).mean() * 0.5
        if self.clip_aux_multinet_critic_loss is not None:
            loss_critic = torch.clamp(loss_critic, min=0, max=self.clip_aux_critic_loss)
        loss_critic = self.aux_critic_loss_coef * loss_critic
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        pis_old = log_pis_old.exp_()
        kl_loss = (pis_old * (log_pis - log_pis_old)).mean()
        loss_joint = loss_critic + self.beta_clone * kl_loss
        self.multipleNet_optimizer.zero_grad()
        loss_joint.backward(retain_graph=False)
        self.multipleNet_optimizer.step()
        return loss_critic, self.beta_clone * kl_loss, loss_joint

    def update_critic_Auxiliary(self, states, targets):
        """loss = L^{value} = mse(v(s) - v_targ)"""
        # The * 0.5 follows https://arxiv.org/pdf/2009.04416.pdf page 2.
        loss_critic_aux = (self.critic(states) - targets).pow_(2).mean() * 0.5
        if self.clip_aux_critic_loss is not None:
            loss_critic_aux = torch.clamp(loss_critic_aux, min=0, max=self.clip_aux_critic_loss)
        self.critic_optimizer.zero_grad()
        loss_critic_aux.backward(retain_graph=False)
        # nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        return loss_critic_aux

    def update_critic(self, states, targets):
        """loss = L^{value} = mse(v(s) - v_targ)"""
        # The * 0.5 follows https://arxiv.org/pdf/2009.04416.pdf page 2.
        loss_critic = (self.critic(states) - targets).pow_(2).mean() * 0.5
        self.critic_optimizer.zero_grad()
        loss_critic.backward(retain_graph=False)
        # nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        return loss_critic

    def update_MultipleNet(self, states, actions, log_pis_old, advantages):
        """Policy phase, update multipleNet. loss = L^{clip} + behavior cloning loss"""
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        mean_ent = log_pis.mean()
        ratios = (log_pis - log_pis_old).exp_()
        loss_actor1 = -ratios * advantages
        loss_actor2 = -torch.clamp(ratios, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advantages
        l_clip = torch.max(loss_actor1, loss_actor2).mean()
        bc_loss = self.coef_ent * mean_ent
        loss_actor = l_clip + bc_loss
        self.multipleNet_optimizer.zero_grad()
        loss_actor.backward(retain_graph=False)
        if self.multipleet_upadte_clip_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.multipleNet.parameters(),
                                           self.multipleet_upadte_clip_grad_norm)
        # nn.utils.clip_grad_norm_(self.multipleNet.parameters(), self.max_grad_norm)
        self.multipleNet_optimizer.step()
        return loss_actor, l_clip, bc_loss

    def get_action(self, state):
        """Select the action with the maximum Q-value."""
        self.multipleNet.eval()
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(-1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action, log_pis = self.multipleNet.sample(state_tensor)
            action = action.view(self.num_action).to('cpu').detach().numpy().copy()
        return action, log_pis
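# Illustration (not part of the original code): util.calculate_advantage is not shown in this file.
# Under the usual assumptions (values carries one extra bootstrap entry, rewards/dones are aligned
# with the first len(values) - 1 states), a generic GAE(lambda) computation looks like the sketch
# below. This is a stand-in, not the repository's implementation.
import torch

def gae_sketch(values, rewards, dones, gamma=0.995, lambda_=0.95):
    """Return (targets, advantages) for PPO/PPG-style updates."""
    T = rewards.shape[0]
    advantages = torch.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * (1 - dones[t]) - values[t]
        gae = delta + gamma * lambda_ * (1 - dones[t]) * gae
        advantages[t] = gae
    targets = advantages + values[:-1]          # value targets \hat{V}_t^{targ}
    return targets, advantages

values = torch.randn(6, 1)       # V(s_0..s_4) plus a bootstrap value V(s_5)
rewards = torch.randn(5, 1)
dones = torch.zeros(5, 1)
targets, advantages = gae_sketch(values, rewards, dones)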
class Agent:
    def __init__(self, **config):
        self.config = config
        self.n_actions = self.config["n_actions"]
        self.state_shape = self.config["state_shape"]
        self.batch_size = self.config["batch_size"]
        self.gamma = self.config["gamma"]
        self.initial_mem_size_to_train = self.config["initial_mem_size_to_train"]
        torch.manual_seed(self.config["seed"])
        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            torch.cuda.empty_cache()
            torch.cuda.manual_seed(self.config["seed"])
            torch.cuda.manual_seed_all(self.config["seed"])
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.memory = ReplayMemory(self.config["mem_size"], self.config["alpha"], self.config["seed"])

        self.v_min = self.config["v_min"]
        self.v_max = self.config["v_max"]
        self.n_atoms = self.config["n_atoms"]
        self.support = torch.linspace(self.v_min, self.v_max, self.n_atoms).to(self.device)
        self.delta_z = (self.v_max - self.v_min) / (self.n_atoms - 1)
        self.offset = torch.linspace(0, (self.batch_size - 1) * self.n_atoms, self.batch_size).long() \
            .unsqueeze(1).expand(self.batch_size, self.n_atoms).to(self.device)

        self.n_step = self.config["n_step"]
        self.n_step_buffer = deque(maxlen=self.n_step)

        self.online_model = Model(self.state_shape, self.n_actions, self.n_atoms,
                                  self.support, self.device).to(self.device)
        self.target_model = Model(self.state_shape, self.n_actions, self.n_atoms,
                                  self.support, self.device).to(self.device)
        self.hard_update_target_network()

        self.optimizer = Adam(self.online_model.parameters(), lr=self.config["lr"], eps=self.config["adam_eps"])

    def choose_action(self, state):
        state = np.expand_dims(state, axis=0)
        state = from_numpy(state).byte().to(self.device)
        with torch.no_grad():
            self.online_model.reset()
            action = self.online_model.get_q_value(state).argmax(-1)
        return action.item()

    def store(self, state, action, reward, next_state, done):
        """Keep transitions as uint8 in RAM so they do not put pressure on GPU memory."""
        assert state.dtype == "uint8"
        assert next_state.dtype == "uint8"
        assert isinstance(reward, int)
        assert isinstance(done, bool)

        self.n_step_buffer.append((state, action, reward, next_state, done))
        if len(self.n_step_buffer) < self.n_step:
            return
        reward, next_state, done = self.get_n_step_returns()
        state, action, *_ = self.n_step_buffer.popleft()
        self.memory.add(state, np.uint8(action), reward, next_state, done)

    def soft_update_target_network(self, tau):
        for target_param, local_param in zip(self.target_model.parameters(), self.online_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
        # self.target_model.train()
        for param in self.target_model.parameters():
            param.requires_grad = False

    def hard_update_target_network(self):
        self.target_model.load_state_dict(self.online_model.state_dict())
        # self.target_model.train()
        for param in self.target_model.parameters():
            param.requires_grad = False

    def unpack_batch(self, batch):
        batch = self.config["transition"](*zip(*batch))

        states = from_numpy(np.stack(batch.state)).to(self.device)
        actions = from_numpy(np.stack(batch.action)).to(self.device).view((-1, 1))
        rewards = from_numpy(np.stack(batch.reward)).to(self.device).view((-1, 1))
        next_states = from_numpy(np.stack(batch.next_state)).to(self.device)
        dones = from_numpy(np.stack(batch.done)).to(self.device).view((-1, 1))
        return states, actions, rewards, next_states, dones

    def train(self, beta):
        if len(self.memory) < self.initial_mem_size_to_train:
            return 0, 0  # no loss yet
        batch, weights, indices = self.memory.sample(self.batch_size, beta)
        states, actions, rewards, next_states, dones = self.unpack_batch(batch)
        weights = from_numpy(weights).float().to(self.device)

        with torch.no_grad():
            self.online_model.reset()
            self.target_model.reset()
            q_eval_next = self.online_model.get_q_value(next_states)
            selected_actions = torch.argmax(q_eval_next, dim=-1)
            q_next = self.target_model(next_states)[range(self.batch_size), selected_actions]

            projected_atoms = rewards + (self.gamma ** self.n_step) * self.support * (~dones)
            projected_atoms = projected_atoms.clamp(min=self.v_min, max=self.v_max)

            b = (projected_atoms - self.v_min) / self.delta_z
            lower_bound = b.floor().long()
            upper_bound = b.ceil().long()
            lower_bound[(upper_bound > 0) * (lower_bound == upper_bound)] -= 1
            upper_bound[(lower_bound < (self.n_atoms - 1)) * (lower_bound == upper_bound)] += 1

            projected_dist = torch.zeros(q_next.size(), dtype=torch.float64).to(self.device)
            projected_dist.view(-1).index_add_(0, (lower_bound + self.offset).view(-1),
                                               (q_next * (upper_bound.float() - b)).view(-1))
            projected_dist.view(-1).index_add_(0, (upper_bound + self.offset).view(-1),
                                               (q_next * (b - lower_bound.float())).view(-1))

        eval_dist = self.online_model(states)[range(self.batch_size), actions.squeeze().long()]
        dqn_loss = -(projected_dist * torch.log(eval_dist + 1e-6)).sum(-1)
        td_error = dqn_loss.abs() + 1e-6
        self.memory.update_priorities(indices, td_error.detach().cpu().numpy())
        dqn_loss = (dqn_loss * weights).mean()

        self.optimizer.zero_grad()
        dqn_loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(self.online_model.parameters(), self.config["clip_grad_norm"])
        self.optimizer.step()

        return dqn_loss.item(), grad_norm.item()

    def ready_to_play(self, state_dict):
        self.online_model.load_state_dict(state_dict)
        self.online_model.eval()

    def get_n_step_returns(self):
        reward, next_state, done = self.n_step_buffer[-1][-3:]

        for transition in reversed(list(self.n_step_buffer)[:-1]):
            r, n_s, d = transition[-3:]
            reward = r + self.gamma * reward * (1 - d)
            next_state, done = (n_s, d) if d else (next_state, done)
        return reward, next_state, done
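# Illustration (not part of the original code): Agent.get_n_step_returns folds the n most recent
# transitions into a single n-step reward. A worked example with gamma = 0.99 and the toy rewards
# below (no terminal flags) gives 1 + 0.99 * 0 + 0.99**2 * 2, i.e. about 2.9602.
gamma = 0.99
transitions = [(1, False), (0, False), (2, False)]       # (reward, done) for the last n = 3 steps

reward = transitions[-1][0]
for r, d in reversed(transitions[:-1]):
    reward = r + gamma * reward * (1 - d)
print(reward)    # ~2.9602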
TARGET_UPDATE = 10
learningrate = 0.001

# Creates the environment, search strategy and agent
env = EnvironmentManager(device, "CarRacing-v0", actionDict)
strat = EpsilonGreedyStrategy(EPS_END, EPS_END, EPS_DECAY)
agent = Agent(strat, env.num_actions_available(), device)

# Creates the policy and target network
policy_net = DQN(env.get_screen_height(), env.get_screen_width(),
                 env.num_actions_available(), n_latent_var).to(device)
target_net = DQN(env.get_screen_height(), env.get_screen_width(),
                 env.num_actions_available(), n_latent_var).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=learningrate)
memory = ReplayMemory(10000)

InputLayer = keras.layers.Input(batch_shape=(None, 224, 224, 3))
road = keras.applications.MobileNetV2(input_tensor=InputLayer, weights=None, classes=2)
Nadam = keras.optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999)
road.compile(optimizer=Nadam, loss='mean_squared_error', metrics=['accuracy'])
road.load_weights('Unitygym.h5')
print("Loaded keras weights")

writer = open("DQNRoad.csv", mode="a")


def runner(num_episodes, max_timestep, BATCH_SIZE, env):
    episodeRew = []
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
def print_play_message(card_tnr: np.ndarray):
    output_str = "The Network plays: "
    if card_tnr[1] == 0:
        tmp = rank_strings[3:4] + rank_strings[5:6] + rank_strings[:3] + rank_strings[4:5] + rank_strings[6:]
        output_str += tmp[card_tnr[0]]
    else:
        output_str += rank_strings[card_tnr[0]]
    output_str += suit_strings[card_tnr[1]]
    print(output_str)


assert len(sys.argv) == 3, sys.argv

pred_memory = ReplayMemory(1)
strat_memory = RnnReplayMemory(1)

pred_network = PredictionNetwork(keras.models.load_model(sys.argv[1]), pred_memory, batch_size=1, can_train=False)
strat_network = RnnStrategyNetwork(keras.models.load_model(sys.argv[2]), strat_memory, batch_size=1, can_train=False)

player = RnnPlayer(pred_network, strat_network, 0, 0)

absolute_position = int(input("What is the index of the player? "))
assert 0 <= absolute_position < 4, "the given absolute position is " + str(absolute_position)

player_inter = RnnPlayerInterlayer(player, sum, sum)
player_inter.set_absolute_position(absolute_position)

# get the trump suit
trump_string = None
while trump_string not in suit_strings:
class DdpgAgent:
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.99,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 batch_size=64,
                 memory_size=50000,
                 tau=1e-3,
                 weight_decay=1e-2,
                 writer=None,
                 is_image=False):
        super(DdpgAgent, self).__init__()
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.actor = ActorNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.actor_target = ActorNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay)
        self.memory = ReplayMemory(observation_space, action_space, device,
                                   num_state=self.num_state, memory_size=memory_size, is_image=is_image)
        self.criterion = nn.SmoothL1Loss()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image

    def normalize_state(self, state):
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def update(self):
        self.update_step += 1
        with torch.no_grad():
            batch, indices, probability_distribution = self.memory.random_sample()
            # Fetch the state and action values for each sample.
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            next_obs_batch = batch['next_obs'].clone().to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            next_q_value_index = self.actor_target(next_obs_batch)
            # Look up the target Q-network's value for the corresponding action.
            next_q_value = self.critic_target(next_obs_batch, next_q_value_index)
            # Compute the target value.
            target_q_values = reward_batch + self.gamma * next_q_value * (1 - terminate_batch)
        self.actor.train()
        self.critic.train()
        q_values = self.critic(state_batch, action_batch)
        # Compute the critic loss.
        critic_loss = self.criterion(q_values, target_q_values)
        # Reset the gradients.
        self.critic_optimizer.zero_grad()
        # Backpropagate.
        critic_loss.backward()
        # Apply the gradient update.
        self.critic_optimizer.step()
        if self.writer and self.update_step % 1000 == 0:
            self.writer.add_scalar("loss/critic", critic_loss.item(), self.update_step / 1000)
            #print("loss/critic", critic_loss.item())
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        if self.writer and self.update_step % 1000 == 0:
            self.writer.add_scalar("loss/actor", actor_loss.item(), self.update_step / 1000)
            #print("loss/actor", actor_loss.item())
        self.soft_update(self.actor_target, self.actor)
        self.soft_update(self.critic_target, self.critic)
        self.actor.eval()
        self.critic.eval()

    # Select the action with the highest Q-value.
    def get_action(self, state, noise=None, timestep=0):
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(-1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action = self.actor(state_tensor).view(self.num_action)
            if noise is not None:
                noise = noise(timestep)
                action = np.clip(action.to('cpu').detach().numpy().copy() + noise, -1, 1)
            else:
                action = np.clip(action.to('cpu').detach().numpy().copy(), -1, 1)
        return action
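# Illustration (not part of the original code): DdpgAgent.soft_update above is a Polyak average,
# target <- tau * online + (1 - tau) * target. The self-contained sketch below applies it to two
# throwaway linear layers and shows the target drifting toward the online weights.
import torch
import torch.nn as nn

def soft_update(target_net, net, tau=1e-3):
    for target_param, param in zip(target_net.parameters(), net.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

online, target = nn.Linear(4, 2), nn.Linear(4, 2)
target.load_state_dict(online.state_dict())
with torch.no_grad():
    online.weight.add_(1.0)                    # pretend the online net was trained
before = (target.weight - online.weight).abs().mean().item()
soft_update(target, online, tau=0.5)
after = (target.weight - online.weight).abs().mean().item()
print(before, after)                           # the gap shrinks after the soft update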
class PpgAgent:
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.995,
                 actor_lr=5e-4,
                 critic_lr=5e-4,
                 batch_size=128,
                 memory_size=50000,
                 tau=5e-3,
                 weight_decay=1e-2,
                 sigma=0.2,
                 noise_clip=0.5,
                 alpha=0.2,
                 alpha_lr=3e-4,
                 rollout_length=2048,
                 lambda_=0.95,
                 beta_clone=1.0,
                 coef_ent=0.01,
                 num_updates=32,
                 policy_epoch=1,
                 value_epoch=1,
                 aux_num_updates=6,
                 aux_epoch_batch=16,
                 max_grad_norm=0.5,
                 clip_eps=0.2,
                 writer=None,
                 is_image=False):
        super(PpgAgent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 * (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.multipleNet = MultipleNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.multipleNet_target = MultipleNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.multipleNet_target.load_state_dict(self.multipleNet.state_dict())
        self.multipleNet_optimizer = optim.Adam(self.multipleNet.parameters(), lr=actor_lr)
        self.critic = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay)
        self.alpha = alpha
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)
        self.memory = ReplayMemory(observation_space, action_space, device,
                                   num_state=self.num_state, memory_size=memory_size, is_image=is_image)
        self.criterion = nn.MSELoss()
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.rollout_length = rollout_length
        self.lambda_ = lambda_
        self.coef_ent = coef_ent
        self.max_grad_norm = max_grad_norm
        self.aux_num_updates = aux_num_updates
        self.clip_eps = clip_eps
        self.beta_clone = beta_clone
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.num_updates = num_updates
        self.aux_epoch_batch = aux_epoch_batch

    def normalize_state(self, state):
        if self.state_mean is None:
            return state
        state = (state - self.state_mean) / self.state_halfwidth
        return state

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def is_update(self, steps):
        return steps % self.rollout_length == 0

    def update(self, state=None):
        if not self.is_update(self.memory.index):
            return
        self.update_step += 1
        with torch.no_grad():
            batch = self.memory.sample(state)
            # Fetch the state and action values for each sample.
            action_batch = batch['actions'].to(self.device)
            state_batch = batch['obs'].to(self.device)
            reward_batch = batch['rewards'].to(self.device)
            terminate_batch = batch['terminates'].to(self.device)
            log_pis_batch = batch['log_pis'].to(self.device)
            values = self.critic(state_batch)
            targets, advantages = util.calculate_advantage(values, reward_batch, terminate_batch,
                                                           self.gamma, self.lambda_)
        for j in range(self.num_updates):
            for i in range(max(self.policy_epoch, self.value_epoch)):
                indices = np.arange(self.rollout_length)
                np.random.shuffle(indices)
                for start in range(0, self.rollout_length, self.batch_size):
                    idxes = indices[start:start + self.batch_size]
                    if self.policy_epoch > i:
                        self.update_MultipleNet(state_batch[idxes], action_batch[idxes],
                                                log_pis_batch[idxes], advantages[idxes])
                    if self.value_epoch > i:
                        self.update_critic(state_batch[idxes], targets[idxes])
        with torch.no_grad():
            log_pis_old = self.multipleNet.evaluate_log_pi(state_batch[:-1], action_batch)
        for _ in range(self.aux_num_updates):
            indices = np.arange(self.rollout_length)
            np.random.shuffle(indices)
            for start in range(0, self.rollout_length, self.aux_epoch_batch):
                idxes = indices[start:start + self.aux_epoch_batch]
                self.update_actor_Auxiliary(state_batch[idxes], action_batch[idxes],
                                            log_pis_old[idxes], targets[idxes], advantages[idxes])
                self.update_critic_Auxiliary(state_batch[idxes], targets[idxes])
        self.multipleNet.eval()
        self.critic.eval()

    def update_actor_Auxiliary(self, states, actions, log_pis_old, targets, advantages):
        loss_critic = (self.multipleNet.q_forward(states) - targets).pow_(2).mean()
        loss_bc = (self.multipleNet.p_forward(states) - actions).pow_(2).mean()
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        pis_old = log_pis_old.exp_()
        kl_loss = (pis_old * (log_pis - log_pis_old)).mean()
        loss_joint = loss_critic + self.beta_clone * kl_loss
        self.multipleNet_optimizer.zero_grad()
        loss_joint.backward(retain_graph=False)
        self.multipleNet_optimizer.step()
        if self.update_step % 10 == 0:
            print("aux actor loss:", loss_joint.item())

    def update_critic_Auxiliary(self, states, targets):
        loss_critic_aux = (self.critic(states) - targets).pow_(2).mean()
        self.critic_optimizer.zero_grad()
        loss_critic_aux.backward(retain_graph=False)
        #nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        #if self.update_step % 50 == 0:
        #    print("aux critic loss:", loss_critic_aux.item())

    def update_critic(self, states, targets):
        loss_critic = (self.critic(states) - targets).pow_(2).mean()
        self.critic_optimizer.zero_grad()
        loss_critic.backward(retain_graph=False)
        #nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optimizer.step()
        #if self.update_step % 50 == 0:
        #    print("critic loss:", loss_critic.item())

    def update_MultipleNet(self, states, actions, log_pis_old, advantages):
        log_pis = self.multipleNet.evaluate_log_pi(states, actions)
        if self.update_step % 50 == 0:
            print("log_pis:", log_pis)
        mean_ent = -log_pis.mean()
        ratios = (log_pis - log_pis_old).exp_()
        loss_actor1 = -ratios * advantages
        loss_actor2 = -torch.clamp(ratios, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advantages
        loss_actor = torch.max(loss_actor1, loss_actor2).mean() - self.coef_ent * mean_ent
        self.multipleNet_optimizer.zero_grad()
        loss_actor.backward(retain_graph=False)
        #nn.utils.clip_grad_norm_(self.multipleNet.parameters(), self.max_grad_norm)
        self.multipleNet_optimizer.step()
        if self.update_step % 50 == 0:
            print("actor loss:", loss_actor.item())

    # Select the action with the highest Q-value.
    def get_action(self, state):
        self.multipleNet.eval()
        if not self.is_image:
            state_tensor = torch.tensor(self.normalize_state(state),
                                        dtype=torch.float).view(-1, self.num_state).to(self.device)
        else:
            state_tensor = torch.tensor(state.copy() / 255.,
                                        dtype=torch.float).unsqueeze(0).to(self.device)
        with torch.no_grad():
            action, log_pis = self.multipleNet.sample(state_tensor)
            action = action.view(self.num_action).to('cpu').detach().numpy().copy()
        return action, log_pis
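# Illustration (not part of the original code): both PpgAgent variants above use the PPO clipped
# surrogate in update_MultipleNet. The sketch below reproduces that loss on toy tensors; log_pis,
# log_pis_old and advantages are random placeholders.
import torch

clip_eps = 0.2
log_pis = torch.randn(32, 1) * 0.1
log_pis_old = torch.randn(32, 1) * 0.1
advantages = torch.randn(32, 1)

ratios = (log_pis - log_pis_old).exp()                                    # pi_new / pi_old
loss_unclipped = -ratios * advantages
loss_clipped = -torch.clamp(ratios, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
l_clip = torch.max(loss_unclipped, loss_clipped).mean()                   # pessimistic (clipped) objective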