def __init__(self, state_size, action_size, num_agents):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(RANDOM_SEED)
    self.num_agents = num_agents

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size).to(device)
    self.actor_target = Actor(state_size, action_size).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size).to(device)
    self.critic_target = Critic(state_size, action_size).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                       weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)

    # Directory where to save the model
    self.model_dir = os.getcwd() + "/DDPG/saved_models"
    os.makedirs(self.model_dir, exist_ok=True)
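# The ReplayBuffer class itself is not shown in these snippets (and the different agents
# below construct it with different signatures). A minimal uniform-sampling buffer that is
# compatible with the ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE) call used by the
# DDPG agent above might look like the following sketch; field names and the tensor
# conversion are assumptions, not the original implementation.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly (assumed sketch)."""

    def __init__(self, action_size, buffer_size, batch_size):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)  # oldest experiences are dropped first
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)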
def __init__(self, env, gamma=0.99, polyak=0.995, act_noise=0.1, render=False,
             batch_size=32, q_lr=1e-3, p_lr=1e-4, d=2, buffer_capacity=5000,
             max_episodes=100, save_path=None, load_path=None, print_freq=1,
             start_steps=10000, log_dir='logs/train', training=True):
    self.gamma = gamma
    self.polyak = polyak
    self.act_noise = act_noise
    self.render = render
    self.batch_size = batch_size
    self.p_lr = p_lr
    self.q_lr = q_lr
    self.d = d
    self.max_episodes = max_episodes
    self.start_steps = start_steps

    self.actor, self.critic_1, self.critic_2 = create_actor_critic(
        env.observation_space.shape[0], env.action_space.shape[0], env.action_space.high)
    self.target_actor, self.target_critic_1, self.target_critic_2 = create_actor_critic(
        env.observation_space.shape[0], env.action_space.shape[0], env.action_space.high)
    self.target_actor.set_weights(self.actor.get_weights())
    self.target_critic_1.set_weights(self.critic_1.get_weights())
    self.target_critic_2.set_weights(self.critic_2.get_weights())

    self.env = env
    self.rewards = []
    self.print_freq = print_freq
    self.save_path = save_path

    if training:
        self.buffer = ReplayBuffer(buffer_capacity)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
        self.critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
        self.summary_writer = tf.summary.create_file_writer(log_dir)
        self.mse = tf.keras.losses.MeanSquaredError()

    if load_path is not None:
        self.actor.load_weights(f'{load_path}/actor')
        self.critic_1.load_weights(f'{load_path}/critic_1')
        self.critic_2.load_weights(f'{load_path}/critic_2')
def initialize(self, args):
    BaseModel.initialize(self, args)
    self.input_B = self.Tensor(args.batchSize, 3, 1024, 256)
    self.input_C = self.Tensor(args.batchSize, 1, 1024, 256)
    self.fake_Buffer = ReplayBuffer()

    self.netG_BtoC = networks.define_G(3, 1, 64, 'unet_128', 'batch', False,
                                       args.init_type, self.gpu_ids)
    self.netD_C = networks.define_D(1, 64, 'basic', norm='batch',
                                    use_sigmoid=False, gpu_ids=args.gpu_ids)
    self.netG_BtoC.apply(weights_init_normal)
    self.netD_C.apply(weights_init_normal)

    checkpoint_BtoC_filename = 'netG_B2C.pth'
    checkpoint_D_C_filename = 'netD_C.pth'
    checkpoint_path_BtoC = os.path.join(args.checkpoints_dir, checkpoint_BtoC_filename)
    checkpoint_path_D_C = os.path.join(args.checkpoints_dir, checkpoint_D_C_filename)

    # Load checkpoint
    # self.netG_BtoC.load_state_dict(torch.load(checkpoint_path_BtoC))
    # self.netD_C.load_state_dict(torch.load(checkpoint_path_D_C))

    # define loss
    self.criterionGAN = torch.nn.MSELoss()
    self.criterionReconstruction = torch.nn.L1Loss().cuda()

    # init optimizer
    self.optimizer_G = torch.optim.Adam(self.netG_BtoC.parameters(), lr=0.0002, betas=(0.5, 0.999))
    self.optimizer_D = torch.optim.Adam(self.netD_C.parameters(), lr=0.0002, betas=(0.5, 0.999))
    self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR(
        self.optimizer_G, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step)
    self.lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR(
        self.optimizer_D, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step)
def __init__(self, gamma=0.999, buffer_size=1e5, batch_size=1024, episodes_nr=50000,
             tau=2e-2, gym_name='MountainCarContinuous-v0'):
    self.lr_actor = 5e-3        # learning rate for the actor
    self.lr_critic = 1e-3       # learning rate for the critic
    self.lr_decay = 1           # learning rate decay (per episode)
    self.l2_reg_actor = 1e-7    # L2 regularization factor for the actor
    self.l2_reg_critic = 1e-7   # L2 regularization factor for the critic
    self.num_episodes = episodes_nr  # number of episodes
    self.max_steps_ep = 10000   # default max number of steps per episode (unless the env has a lower hardcoded limit)
    self.train_every = 1        # number of steps to run the policy (and collect experience) before updating network weights
    self.replay_memory_capacity = buffer_size  # capacity of the experience replay memory
    self.batch_size = batch_size
    self.memory = ReplayBuffer(int(buffer_size))
    self.episodes_nr = episodes_nr
    self.gamma = gamma
    self.tau = tau

    self.env = gym.make(gym_name)
    assert np.all(self.env.action_space.high == -self.env.action_space.low)
    self.action_dim = np.prod(np.array(self.env.action_space.shape))
    self.state_dim = np.prod(np.array(self.env.observation_space.shape))
    self.action_range = self.env.action_space.high - self.env.action_space.low

    # Exploration noise: Ornstein-Uhlenbeck process dXt = theta*(mu - Xt)*dt + sigma*dWt
    self.initial_noise_scale = 0.1  # scale of the exploration noise process (1.0 is the range of each action dimension)
    self.noise_decay = 1            # decay rate (per episode) of the exploration noise scale, e.g. 0.99
    self.exploration_mu = 0.0       # mu parameter of the exploration noise process
    self.exploration_theta = 0.15   # theta parameter of the exploration noise process
    self.exploration_sigma = 0.2    # sigma parameter of the exploration noise process
    self.noise = OUNoise(self.action_dim)
def __init__(self, task):
    self.lr_actor = 5e-3       # learning rate for the actor
    self.lr_critic = 1e-3      # learning rate for the critic
    # self.lr_decay = 1        # learning rate decay (per episode)
    self.l2_reg_actor = 1e-7   # L2 regularization factor for the actor
    self.l2_reg_critic = 1e-7  # L2 regularization factor for the critic
    # self.num_episodes = 2000    # number of episodes
    # self.max_steps_ep = 10000   # default max number of steps per episode (unless the env has a lower hardcoded limit)
    self.batch_size = 1024
    self.memory = ReplayBuffer(int(1e5))
    # self.episodes_nr = 10000
    self.gamma = 0.999
    self.tau = 2e-2
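# The OUNoise process used by the two agents above is not defined in these snippets.
# A minimal Ornstein-Uhlenbeck sketch matching the dXt = theta*(mu - Xt)*dt + sigma*dWt
# comments; the OUNoise(action_dim) constructor matches the call sites, the default
# parameter values and method names are assumptions.
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        """Reset the internal state to the long-run mean."""
        self.state = np.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new noise value."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state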
        opt.n_epochs, 0, opt.decay_epoch).step)
lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR(
    optimizer_D_B, lr_lambda=LambdaLR(opt.n_epochs, 0, opt.decay_epoch).step)

# Data (random placeholder tensors standing in for real image batches)
dataA = np.random.randn(3, 256, 256).astype(np.float32)
dataB = np.random.randn(3, 256, 256).astype(np.float32)
train = torch.utils.data.TensorDataset(torch.from_numpy(dataA), torch.from_numpy(dataB))
data_loader = torch.utils.data.DataLoader(train, batch_size=opt.batchSize, shuffle=True)

buffer_A = ReplayBuffer()
buffer_B = ReplayBuffer()

# targets
real_target = Tensor(opt.batchSize).fill_(1.0)
fake_target = Tensor(opt.batchSize).fill_(0.0)

# Parameters
lambda_identity = 5.0
lambda_cycle = 10.0

for epoch in range(opt.n_epochs):
    for i, data in enumerate(data_loader):
        realA, realB = data
        ### Generator
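# In the GAN snippets (buffer_A / buffer_B here, fake_A_Buffer / fake_B_Buffer below) the
# ReplayBuffer appears to be a generated-image pool in the CycleGAN sense: push_and_pop()
# stores fake images and sometimes returns an older one so the discriminator sees a history
# of generated samples. A minimal sketch assuming the usual 50-image pool and 50% swap rule;
# this is not necessarily the original implementation.
import random

import torch


class ReplayBuffer:
    def __init__(self, max_size=50):
        assert max_size > 0
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        to_return = []
        for element in batch.data:
            element = torch.unsqueeze(element, 0)
            if len(self.data) < self.max_size:
                # Pool not full yet: store the image and return it unchanged.
                self.data.append(element)
                to_return.append(element)
            elif random.uniform(0, 1) > 0.5:
                # Return a randomly chosen stored image and replace it with the new one.
                i = random.randint(0, self.max_size - 1)
                to_return.append(self.data[i].clone())
                self.data[i] = element
            else:
                # Return the new image without storing it.
                to_return.append(element)
        return torch.cat(to_return)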
class HIRO: def __init__(self, env, gamma=0.99, polyak=0.995, c=10, d=2, high_act_noise=0.1, low_act_noise=0.1, high_rew_scale=0.1, low_rew_scale=1.0, render=False, batch_size=32, q_lr=1e-3, p_lr=1e-4, buffer_capacity=5000, max_episodes=100, save_path=None, load_path=None, print_freq=1, log_dir='logs/train', training=True ): self.gamma = gamma self.polyak = polyak self.low_act_noise = low_act_noise self.high_act_noise = high_act_noise self.low_rew_scale = low_rew_scale self.high_rew_scale = high_rew_scale self.render = render self.batch_size = batch_size self.p_lr = p_lr self.q_lr = q_lr self.max_episodes = max_episodes self.env = env self.rewards = [] self.print_freq = print_freq self.save_path = save_path self.c = c self.d = d self.higher_buffer = ReplayBuffer(buffer_capacity, tuple_length=5) self.lower_buffer = ReplayBuffer(buffer_capacity, tuple_length=4) self.low_actor, self.low_critic_1, self.low_critic_2 = create_actor_critic( state_dim=2 * env.observation_space.shape[0], action_dim=env.action_space.shape[0], action_range=env.action_space.high) self.low_target_actor, self.low_target_critic_1, self.low_target_critic_2 = create_actor_critic( state_dim=2 * env.observation_space.shape[0], action_dim=env.action_space.shape[0], action_range=env.action_space.high) self.high_actor, self.high_critic_1, self.high_critic_2 = create_actor_critic( state_dim=env.observation_space.shape[0], action_dim=env.observation_space.shape[0], action_range=env.observation_space.high) self.high_target_actor, self.high_target_critic_1, self.high_target_critic_2 = create_actor_critic( state_dim=env.observation_space.shape[0], action_dim=env.observation_space.shape[0], action_range=env.observation_space.high) self.low_target_actor.set_weights(self.low_actor.get_weights()) self.low_target_critic_1.set_weights(self.low_critic_1.get_weights()) self.low_target_critic_2.set_weights(self.low_critic_2.get_weights()) self.high_target_actor.set_weights(self.high_actor.get_weights()) self.high_target_critic_1.set_weights(self.high_critic_1.get_weights()) self.high_target_critic_2.set_weights(self.high_critic_2.get_weights()) if training: self.low_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr) self.low_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr) self.low_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr) self.high_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr) self.high_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr) self.high_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr) self.mse = tf.keras.losses.MeanSquaredError() self.summary_writer = tf.summary.create_file_writer(log_dir) self.low_actor_train_fn = self.create_train_step_actor_fn(self.low_actor, self.low_critic_1, self.low_actor_optimizer) self.low_critic_train_fns = [self.create_train_step_critic_fn(critic=c, optimizer=o) for c, o in [(self.low_critic_1, self.low_critic_1_optimizer), (self.low_critic_2, self.low_critic_2_optimizer)]] self.high_actor_train_fn = self.create_train_step_actor_fn(self.high_actor, self.high_critic_1, self.high_actor_optimizer) self.high_critic_train_fns = [self.create_train_step_critic_fn(critic=c, optimizer=o) for c, o in [(self.high_critic_1, self.high_critic_1_optimizer), (self.high_critic_2, self.high_critic_2_optimizer)]] if load_path is not None: self.low_actor.load_weights(f'{load_path}/low/actor') self.low_critic_1.load_weights(f'{load_path}/low/critic_1') 
self.low_critic_2.load_weights(f'{load_path}/low/critic_2') self.high_actor.load_weights(f'{load_path}/high/actor') self.high_critic_1.load_weights(f'{load_path}/high/critic_1') self.high_critic_2.load_weights(f'{load_path}/high/critic_2') @staticmethod def goal_transition(state, goal, next_state): return state + goal - next_state @staticmethod def intrinsic_reward(state, goal, next_state): return - np.linalg.norm(state + goal - next_state) def act(self, obs, goal, noise=False): norm_dist = tf.random.normal(self.env.action_space.shape, stddev=0.1 * self.env.action_space.high) action = self.low_actor(np.concatenate((obs, goal), axis=1)).numpy() action = np.clip(action + (norm_dist.numpy() if noise else 0), a_min=self.env.action_space.low, a_max=self.env.action_space.high) return action def get_goal(self, obs, noise=False): norm_dist = tf.random.normal(self.env.observation_space.shape, stddev=0.1 * self.env.observation_space.high) action = self.high_actor(obs).numpy() action = np.clip(action + (norm_dist.numpy() if noise else 0), a_min=self.env.observation_space.low, a_max=self.env.observation_space.high) return action @tf.function def log_probability(self, states, actions, candidate_goal): goals = tf.reshape(candidate_goal, (1, -1)) def body(curr_i, curr_goals, s): new_goals = tf.concat( (curr_goals, tf.reshape(self.goal_transition(s[curr_i - 1], curr_goals[curr_i - 1], s[curr_i]), (1, -1))), axis=0) curr_i += 1 return [curr_i, new_goals, s] def condition(curr_i, curr_goals, s): return curr_i < s.shape[0] and not ( tf.equal(tf.math.count_nonzero(s[curr_i]), 0) and tf.equal(tf.math.count_nonzero(actions[curr_i]), 0)) # If a state-action pair is all zero, then the episode ended before an entire sequence of length c was recorded. # We must remove these empty states and actions from the log probability calculation, as they could skew the # argmax computation i = tf.constant(1) i, goals, states = tf.while_loop(condition, body, [i, goals, states], shape_invariants=[tf.TensorShape(None), tf.TensorShape([None, goals.shape[1]]), states.shape]) states = states[:i, :] actions = actions[:i, :] action_predictions = self.low_actor(tf.concat((states, goals), axis=1)) return -(1 / 2) * tf.reduce_sum(tf.linalg.norm(actions - action_predictions, axis=1)) @tf.function def off_policy_correct(self, states, goals, actions, new_states): first_states = tf.reshape(states, (self.batch_size, -1))[:, :new_states[0].shape[0]] means = new_states - first_states std_dev = 0.5 * (1 / 2) * tf.convert_to_tensor(self.env.observation_space.high) for i in range(states.shape[0]): # Sample eight candidate goals sampled randomly from a Gaussian centered at s_{t+c} - s_t # Include the original goal and a goal corresponding to the difference s_{t+c} - s_t # TODO: clip the random actions to lie within the high-level action range candidate_goals = tf.concat( (tf.random.normal(shape=(8, self.env.observation_space.shape[0]), mean=means[i], stddev=std_dev), tf.reshape(goals[i], (1, -1)), tf.reshape(means[i], (1, -1))), axis=0) chosen_goal = tf.argmax( [self.log_probability(states[i], actions[i], candidate_goals[g]) for g in range(candidate_goals.shape[0])]) goals = tf.tensor_scatter_nd_update(goals, [[i]], [candidate_goals[chosen_goal]]) return first_states, goals @tf.function def train_step_critics(self, states, actions, rewards, next_states, actor, target_critic_1, target_critic_2, critic_trains_fns, target_noise, scope='Policy'): target_goal_preds = actor(next_states) target_goal_preds += target_noise target_q_values_1 = 
target_critic_1([next_states, target_goal_preds]) target_q_values_2 = target_critic_2([next_states, target_goal_preds]) target_q_values = tf.concat((target_q_values_1, target_q_values_2), axis=1) target_q_values = tf.reshape(tf.reduce_min(target_q_values, axis=1), (self.batch_size, -1)) targets = rewards + self.gamma * target_q_values critic_trains_fns[0](states, actions, targets, scope=scope, label='Critic 1') critic_trains_fns[1](states, actions, targets, scope=scope, label='Critic 2') def create_train_step_actor_fn(self, actor, critic, optimizer): @tf.function def train_step_actor(states, scope='policy', label='actor'): with tf.GradientTape() as tape: action_predictions = actor(states) q_values = critic([states, action_predictions]) policy_loss = -tf.reduce_mean(q_values) gradients = tape.gradient(policy_loss, actor.trainable_variables) optimizer.apply_gradients(zip(gradients, actor.trainable_variables)) with tf.name_scope(scope): with self.summary_writer.as_default(): tf.summary.scalar(f'{label} Policy Loss', policy_loss, step=optimizer.iterations) return train_step_actor def create_train_step_critic_fn(self, critic, optimizer): @tf.function def train_step_critic(states, actions, targets, scope='Policy', label='Critic'): with tf.GradientTape() as tape: q_values = critic([states, actions]) mse_loss = self.mse(q_values, targets) gradients = tape.gradient(mse_loss, critic.trainable_variables) optimizer.apply_gradients(zip(gradients, critic.trainable_variables)) with tf.name_scope(scope): with self.summary_writer.as_default(): tf.summary.scalar(f'{label} MSE Loss', mse_loss, step=optimizer.iterations) tf.summary.scalar(f'{label} Mean Q Values', tf.reduce_mean(q_values), step=optimizer.iterations) return train_step_critic def update_lower(self): if len(self.lower_buffer) >= self.batch_size: states, actions, rewards, next_states = self.lower_buffer.sample(self.batch_size) rewards = rewards.reshape(-1, 1).astype(np.float32) self.train_step_critics(states, actions, rewards, next_states, self.low_actor, self.low_target_critic_1, self.low_target_critic_2, self.low_critic_train_fns, target_noise=tf.random.normal(actions.shape, stddev=0.1 * self.env.action_space.high), scope='Lower_Policy') if self.low_critic_1_optimizer.iterations % self.d == 0: self.low_actor_train_fn(states, scope='Lower_Policy', label='Actor') # Update target networks polyak_average(self.low_actor.variables, self.low_target_actor.variables, self.polyak) polyak_average(self.low_critic_1.variables, self.low_target_critic_1.variables, self.polyak) polyak_average(self.low_critic_2.variables, self.low_target_critic_2.variables, self.polyak) def update_higher(self): if len(self.higher_buffer) >= self.batch_size: states, goals, actions, rewards, next_states = self.higher_buffer.sample(self.batch_size) rewards = rewards.reshape((-1, 1)) states, goals, actions, rewards, next_states = (tf.convert_to_tensor(states, dtype=tf.float32), tf.convert_to_tensor(goals, dtype=tf.float32), tf.convert_to_tensor(actions, dtype=tf.float32), tf.convert_to_tensor(rewards, dtype=tf.float32), tf.convert_to_tensor(next_states, dtype=tf.float32)) states, goals = self.off_policy_correct(states=states, goals=goals, actions=actions, new_states=next_states) self.train_step_critics(states, goals, rewards, next_states, self.high_actor, self.high_target_critic_1, self.high_target_critic_2, self.high_critic_train_fns, target_noise=tf.random.normal(next_states.shape, stddev=0.1 * self.env.observation_space.high), scope='Higher_Policy') if 
self.high_critic_1_optimizer.iterations % self.d == 0: self.high_actor_train_fn(states, scope='Higher_Policy', label='Actor') # Update target networks polyak_average(self.high_actor.variables, self.high_target_actor.variables, self.polyak) polyak_average(self.high_critic_1.variables, self.high_target_critic_1.variables, self.polyak) polyak_average(self.high_critic_2.variables, self.high_target_critic_2.variables, self.polyak) def learn(self): # Collect experiences s_t, g_t, a_t, R_t mean_reward = None total_steps = 0 for ep in range(self.max_episodes): if ep % self.print_freq == 0 and ep > 0: new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:]) print(f"-------------------------------------------------------") print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}") print(f"Total Episodes: {ep}") print(f"Total Steps: {total_steps}") print(f"-------------------------------------------------------") total_steps = 0 with tf.name_scope('Episodic Information'): with self.summary_writer.as_default(): tf.summary.scalar(f'Mean {self.print_freq} Episode Reward', new_mean_reward, step=ep // self.print_freq) # Model saving inspired by Open AI Baseline implementation if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None: print(f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}") print(f'Location: {self.save_path}') mean_reward = new_mean_reward self.low_actor.save_weights(f'{self.save_path}/low/actor') self.low_critic_1.save_weights(f'{self.save_path}/low/critic_1') self.low_critic_2.save_weights(f'{self.save_path}/low/critic_2') self.high_actor.save_weights(f'{self.save_path}/high/actor') self.high_critic_1.save_weights(f'{self.save_path}/high/critic_1') self.high_critic_2.save_weights(f'{self.save_path}/high/critic_2') obs = self.env.reset() goal = self.get_goal(obs.reshape((1, -1)), noise=True).flatten() higher_goal = goal higher_obs = [] higher_actions = [] higher_reward = 0 episode_reward = 0 episode_intrinsic_rewards = 0 ep_len = 0 c = 0 done = False while not done: if self.render: self.env.render() action = self.act(obs.reshape((1, -1)), goal.reshape((1, -1)), noise=True).flatten() new_obs, rew, done, info = self.env.step(action) new_obs = new_obs.flatten() new_goal = self.goal_transition(obs, goal, new_obs) episode_reward += rew # Goals are treated as additional state information for the low level # policy. 
Store transitions in respective replay buffers intrinsic_reward = self.intrinsic_reward(obs, goal, new_obs) * self.low_rew_scale self.lower_buffer.add((np.concatenate((obs, goal)), action, intrinsic_reward, np.concatenate((new_obs, new_goal)),)) episode_intrinsic_rewards += intrinsic_reward self.update_lower() # Fill lists for single higher level transition higher_obs.append(obs) higher_actions.append(action) higher_reward += self.high_rew_scale * rew # Only add transitions to the high level replay buffer every c steps c += 1 if c == self.c or done: # Need all higher level transitions to be the same length # fill the rest of this transition with zeros while c < self.c: higher_obs.append(np.full(self.env.observation_space.shape, 0)) higher_actions.append(np.full(self.env.action_space.shape, 0)) c += 1 self.higher_buffer.add((higher_obs, higher_goal, higher_actions, higher_reward, new_obs)) self.update_higher() c = 0 higher_obs = [] higher_actions = [] higher_reward = 0 goal = self.get_goal(new_obs.reshape((1, -1)), noise=True).flatten() higher_goal = goal obs = new_obs goal = new_goal with tf.name_scope('Episodic Information'): with self.summary_writer.as_default(): tf.summary.scalar(f'Episode Environment Reward', episode_reward, step=ep) tf.summary.scalar(f'Episode Intrinsic Reward', episode_intrinsic_rewards, step=ep) self.rewards.append(episode_reward) total_steps += ep_len
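# The polyak_average() helper called in update_lower() and update_higher() above is not
# shown. A minimal sketch consistent with how it is called (source variables, target
# variables, polyak coefficient); the in-place assign and the exact blend direction are
# assumptions based on the usual soft-update rule.
import tensorflow as tf


def polyak_average(variables, target_variables, polyak=0.995):
    """Soft-update: target <- polyak * target + (1 - polyak) * source, variable by variable."""
    for var, target_var in zip(variables, target_variables):
        target_var.assign(polyak * target_var + (1.0 - polyak) * var)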
class SelfPlayTrainer: def __init__(self, agent, game, buffer_file=None, weights_file=None, n_batches=0): self.agent = agent self.game = game self.replay_buffer = ReplayBuffer() if buffer_file is not None: self.replay_buffer.buffer = pickle.load(open(buffer_file, "rb")) self.current_network = NestedTTTNet() self.control_network = NestedTTTNet() if weights_file is not None: self.control_network.load_state_dict(torch.load(weights_file)) self.current_network.load_state_dict(self.control_network.state_dict()) self.control_network.eval() self.current_network.train() self.agent.update_control_net(self.control_network) self.n_batches = n_batches self.optim = torch.optim.Adam(self.current_network.parameters(), lr=.01, weight_decay=10e-4) def generate_self_play_data(self, n_games=100): for _ in range(n_games): turn_num = 0 self.game.reset() self.agent.reset() result = 0 player_num = 0 states = [] move_vectors = [] while len(self.game.get_valid_moves()) > 0: move, move_probs = self.agent.search(self.game.copy(), turn_num, allotted_playouts=400) states.append(self.game.state.tolist()) move_vectors.append(move_probs) result = self.game.make_move(move) if not result: self.game.switch_player() self.agent.take_action(move) turn_num += 1 player_num = (player_num + 1) % 2 if not result: self.replay_buffer.extend( list(zip(states, move_vectors, zero_gen()))) else: self.replay_buffer.extend( list( zip(states[::-1], move_vectors[::-1], one_neg_one_gen()))[::-1]) def compare_control_to_train(self): self.current_network.eval() old_agent = AlphaMCTSAgent(control_net=self.control_network) new_agent = AlphaMCTSAgent(control_net=self.current_network) agents = [old_agent, new_agent] wins = 0 ties = 0 game = self.game.copy() for game_num in range(100): game.reset() agents[0].reset() agents[1].reset() result = 0 player_num = game_num // 50 #Both take first turn 50 times turn_num = 100 #Turn down the temperature while len(game.get_valid_moves()) > 0: move, _ = agents[player_num].search(game.copy(), turn_num, allotted_playouts=800) _, _ = agents[1 - player_num].search(game.copy(), turn_num, allotted_playouts=800) result = game.make_move(move) if not result: game.switch_player() agents[0].take_action(move) agents[1].take_action(move) player_num = (player_num + 1) % 2 if not result: ties += 1 elif result and player_num == 1: wins += 1 print("After {} games, {} wins and {} ties".format( game_num + 1, wins, ties)) if wins + .5 * ties >= 55: print( "Challenger network won {} games and tied {} games; it becomes new control network" .format(wins, ties)) torch.save(self.current_network.state_dict(), "control_weights_{}.pth".format(self.n_batches)) self.control_network.load_state_dict( self.current_network.state_dict()) else: print( "Challenger network not sufficiently better; {} wins and {} ties" .format(wins, ties)) self.control_network.eval() self.current_network.train() def train_on_batch(self, batch_size=32): if len(self.replay_buffer) < batch_size: return self.current_network.train() sample = self.replay_buffer.sample(batch_size) states, probs, rewards = zip(*sample) states = torch.FloatTensor(states).requires_grad_(True) probs = torch.FloatTensor(probs).requires_grad_(True) rewards = torch.FloatTensor(rewards).unsqueeze(1).requires_grad_(True) self.optim.zero_grad() ps, vs = self.current_network(states) loss = torch.nn.functional.mse_loss( vs, rewards) - (ps.log() * probs).sum() loss.backward() self.optim.step() self.n_batches += 1 return loss.item() def run(self, total_runs=10, self_play_games=100, training_batches=200, 
batch_size=32): losses = [] for run_num in range(1, total_runs + 1): print("Run {} of {}".format(run_num, total_runs)) for selfplay_num in range(1, self_play_games + 1): self.generate_self_play_data(1) print("\tFinished self-play game {} of {} (Buffer size {})". format(selfplay_num, self_play_games, len(self.replay_buffer))) print("Finished {} self-play games".format(self_play_games)) for _ in range(training_batches): losses.append(self.train_on_batch(batch_size)) if len(losses) == 5: print("\tLoss for last 5 batches: {}".format(sum(losses))) losses = [] self.compare_control_to_train()
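# zero_gen() and one_neg_one_gen() are used by generate_self_play_data() but not defined
# here. From how they are zipped against the recorded states, they are presumably infinite
# generators of value targets: all zeros for drawn games, and alternating +1 / -1 for
# decisive games (applied to the reversed state list so the final position gets +1).
# A minimal sketch under that assumption:
def zero_gen():
    """Yield 0 forever (value target for every position of a drawn game)."""
    while True:
        yield 0


def one_neg_one_gen():
    """Yield 1, -1, 1, -1, ... (value targets alternating from the final position backwards)."""
    value = 1
    while True:
        yield value
        value = -value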
class Agent(object): def __init__(self, input_dims, n_actions, layer_sizes, act_lr=0.00001, crt_lr=0.0001, tau=0.001, gamma=0.99, max_size=1000000, batch_size=64, chkpt_dir='tmp/ddpg', name='ddpg', layerNorm=True): self.input_dims = input_dims self.n_actions = n_actions self.layer_sizes = layer_sizes self.layerNorm = layerNorm self.gamma = gamma # discount factor self.tau = tau # target network updating weight self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions) self.batch_size = batch_size self.actor = ActorNetwork(act_lr, self.input_dims, self.n_actions, self.layer_sizes, name='Actor_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.critic = CriticNetwork(crt_lr, self.input_dims, self.n_actions, self.layer_sizes, name='Critic_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.target_actor = ActorNetwork(act_lr, self.input_dims, self.n_actions, self.layer_sizes, name='TargetActor_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.target_critic = CriticNetwork(crt_lr, self.input_dims, self.n_actions, self.layer_sizes, name='TargetCritic_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.noise = OUActionNoise(mu=np.zeros(self.n_actions)) self.update_network_parameters(tau=1) def choose_action(self, observation): self.actor.eval() observation = T.tensor(observation, dtype=T.float).to(self.actor.device) mu = self.actor.forward(observation).to(self.actor.device) mu_prime = mu + T.tensor(self.noise() * 0.05, dtype=T.float).to( self.actor.device) self.actor.train() return mu_prime.cpu().detach().numpy() def remember(self, state, action, reward, new_state, done): self.memory.store_transition(state, action, reward, new_state, done) def learn(self): if self.memory.mem_cntr < self.batch_size: return state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) reward = T.tensor(reward, dtype=T.float).to(self.critic.device) # done = T.tensor(done).to(self.critic.device) new_state = T.tensor(new_state, dtype=T.float).to(self.critic.device) action = T.tensor(action, dtype=T.float).to(self.critic.device) state = T.tensor(state, dtype=T.float).to(self.critic.device) # calculate target self.target_actor.eval() self.target_critic.eval() target_actions = self.target_actor.forward(new_state) critic_value_ = self.target_critic.forward(new_state, target_actions).view(-1) # critic_value_[done] = 0.0 # In building context, terminal state does not have value of 0 target = reward + self.gamma * critic_value_ # train critic self.critic.train() self.critic.optimizer.zero_grad() critic_value = self.critic.forward(state, action).view(-1) critic_loss = F.mse_loss(target, critic_value) critic_loss.backward() self.critic.optimizer.step() # train actor self.critic.eval() self.actor.train() self.actor.optimizer.zero_grad() mu = self.actor.forward(state) actor_loss = -self.critic.forward(state, mu) actor_loss = T.mean(actor_loss) actor_loss.backward() self.actor.optimizer.step() self.update_network_parameters() return critic_loss.item(), actor_loss.item() def update_network_parameters(self, tau=None): if tau is None: tau = self.tau updated_actor = update_single_target_network_parameters( self.actor, self.target_actor, tau) updated_critic = update_single_target_network_parameters( self.critic, self.target_critic, tau) self.target_actor.load_state_dict(updated_actor) self.target_critic.load_state_dict(updated_critic) def save_models(self): print('.... 
saving models ....') self.actor.save_checkpoint() # self.target_actor.save_checkpoint(modelName) self.critic.save_checkpoint() # self.target_critic.save_checkpoint(modelName) def load_models(self): print('.... loading models ....') self.actor.load_checkpoint() # self.target_actor.load_checkpoint(modelName) self.critic.load_checkpoint()
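# update_single_target_network_parameters() is referenced by this agent (and by the TD3
# agent below) but not defined in these snippets. Judging by the call sites it returns a
# state dict that blends the online and target networks with weight tau, which is then
# passed to load_state_dict(). A minimal sketch under that assumption:
def update_single_target_network_parameters(network, target_network, tau):
    """Return a state dict equal to tau * online + (1 - tau) * target, key by key."""
    net_state = network.state_dict()
    target_state = target_network.state_dict()
    return {key: tau * net_state[key] + (1 - tau) * target_state[key] for key in net_state}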
class Agent(): def __init__(self, input_dims, n_actions, layer_sizes, act_lr=0.00001, crt_lr=0.0001, tau=0.001, gamma=0.99, max_size=1000000, batch_size=64, update_actor_interval=2, noise=0.1, noise_targetAct=0.2, chkpt_dir='tmp/td3', name='td3', layerNorm=True): self.input_dims = input_dims self.n_actions = n_actions self.gamma = gamma self.tau = tau self.max_action = 1 self.min_action = -1 self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions) self.batch_size = batch_size self.learn_step_cntr = 0 self.update_actor_iter = update_actor_interval self.actor = ActorNetwork(act_lr, self.input_dims, self.n_actions, layer_sizes, name='Actor_' + name, chkpt_dir=chkpt_dir, layerNorm=layerNorm) self.critic_1 = CriticNetwork(crt_lr, self.input_dims, self.n_actions, layer_sizes, name='Critic1_' + name, chkpt_dir=chkpt_dir, layerNorm=layerNorm) self.critic_2 = CriticNetwork(crt_lr, self.input_dims, self.n_actions, layer_sizes, name='Critic2_' + name, chkpt_dir=chkpt_dir, layerNorm=layerNorm) self.target_actor = ActorNetwork(act_lr, self.input_dims, self.n_actions, layer_sizes, name='TargetActor_' + name, chkpt_dir=chkpt_dir, layerNorm=layerNorm) self.target_critic_1 = CriticNetwork(crt_lr, self.input_dims, self.n_actions, layer_sizes, name='TargetCritic1_' + name, chkpt_dir=chkpt_dir, layerNorm=layerNorm) self.target_critic_2 = CriticNetwork(crt_lr, self.input_dims, self.n_actions, layer_sizes, name='TargetCritic2_' + name, chkpt_dir=chkpt_dir, layerNorm=layerNorm) self.noise = noise self.noise_targetAct = noise_targetAct self.update_network_parameters(tau=1) def choose_action(self, observation): state = T.tensor(observation, dtype=T.float).to(self.actor.device) mu = self.actor.forward(state).to(self.actor.device) mu_prime = mu + T.tensor(np.random.normal(scale=self.noise), dtype=T.float).to(self.actor.device) mu_prime = T.clamp(mu_prime, self.min_action, self.max_action) return mu_prime.cpu().detach().numpy() def remember(self, state, action, reward, new_state, done): self.memory.store_transition(state, action, reward, new_state, done) def learn(self): if self.memory.mem_cntr < self.batch_size: return state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) reward = T.tensor(reward, dtype=T.float).to(self.critic_1.device) # done = T.tensor(done).to(self.critic_1.device) state_ = T.tensor(new_state, dtype=T.float).to(self.critic_1.device) state = T.tensor(state, dtype=T.float).to(self.critic_1.device) action = T.tensor(action, dtype=T.float).to(self.critic_1.device) target_actions = self.target_actor.forward(state_) target_actions = target_actions + \ T.clamp(T.tensor(np.random.normal( scale=self.noise_targetAct)), -0.5, 0.5) target_actions = T.clamp(target_actions, self.min_action, self.max_action) q1_ = self.target_critic_1.forward(state_, target_actions).view(-1) q2_ = self.target_critic_2.forward(state_, target_actions).view(-1) # q1_[done] = 0.0 # In building context, the terminal state does not have 0 value # q2_[done] = 0.0 critic_value_ = T.min(q1_, q2_) target = reward + self.gamma * critic_value_ self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() q1 = self.critic_1.forward(state, action).view(-1) q2 = self.critic_2.forward(state, action).view(-1) q1_loss = F.mse_loss(target, q1) q2_loss = F.mse_loss(target, q2) critic_loss = q1_loss + q2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() self.learn_step_cntr += 1 # if self.learn_step_cntr % self.update_actor_iter != 0: # return 
self.actor.optimizer.zero_grad() actor_q1_loss = self.critic_1.forward( state, self.actor.forward(state)) # can also use the mean # of actor_q1_loss and actor_q2_loss, but it would be slower and does not really matter actor_loss = -T.mean(actor_q1_loss) actor_loss.backward() self.actor.optimizer.step() self.update_network_parameters() return critic_loss.item(), actor_loss.item() def update_network_parameters(self, tau=None): if tau is None: tau = self.tau updated_actor = update_single_target_network_parameters( self.actor, self.target_actor, tau) updated_critic_1 = update_single_target_network_parameters( self.critic_1, self.target_critic_1, tau) updated_critic_2 = update_single_target_network_parameters( self.critic_2, self.target_critic_2, tau) self.target_actor.load_state_dict(updated_actor) self.target_critic_1.load_state_dict(updated_critic_1) self.target_critic_2.load_state_dict(updated_critic_2) def save_models(self): print('.... saving models ....') self.actor.save_checkpoint() # self.target_actor.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() # self.target_critic_1.save_checkpoint() # self.target_critic_2.save_checkpoint() def load_models(self): print('.... loading models ....') self.actor.load_checkpoint() # self.target_actor.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint()
class CycleGanModel(BaseModel): def name(self): return 'TrainCycleGanModel' def initialize(self, args): BaseModel.initialize(self, args) self.input_A = self.Tensor(args.batchSize, 3, 1024, 256) self.input_B = self.Tensor(args.batchSize, 3, 1024, 256) self.fake_A_Buffer = ReplayBuffer() self.fake_B_Buffer = ReplayBuffer() self.netG_AtoB = networks.define_G(3, 3, 64, 'resnet_9blocks', 'instance', False, args.init_type, self.gpu_ids) self.netG_BtoA = networks.define_G(3, 3, 64, 'resnet_9blocks', 'instance', False, args.init_type, self.gpu_ids) self.netD_A = networks.define_D(3, 64, 'basic', norm='instance', use_sigmoid=False, gpu_ids=args.gpu_ids) self.netD_B = networks.define_D(3, 64, 'basic', norm='instance', use_sigmoid=False, gpu_ids=args.gpu_ids) self.netG_AtoB.apply(weights_init_normal) self.netG_BtoA.apply(weights_init_normal) self.netD_A.apply(weights_init_normal) self.netD_B.apply(weights_init_normal) checkpoint_AtoB_filename = 'netG_A2B.pth' checkpoint_BtoA_filename = 'netG_B2A.pth' checkpoint_D_A_filename = 'netD_A.pth' checkpoint_D_B_filename = 'netD_B.pth' checkpoint_path_AtoB = os.path.join(args.checkpoints_dir, checkpoint_AtoB_filename) checkpoint_path_BtoA = os.path.join(args.checkpoints_dir, checkpoint_BtoA_filename) checkpoint_path_D_A = os.path.join(args.checkpoints_dir, checkpoint_D_A_filename) checkpoint_path_D_B = os.path.join(args.checkpoints_dir, checkpoint_D_B_filename) # Load checkpoint # self.netG_AtoB.load_state_dict(torch.load(checkpoint_path_AtoB)) # self.netG_BtoA.load_state_dict(torch.load(checkpoint_path_BtoA)) # self.netD_A.load_state_dict(torch.load(checkpoint_path_D_A)) # self.netD_B.load_state_dict(torch.load(checkpoint_path_D_B)) # define loss # self.criterionGAN = networks.GANLoss().to(self.device) self.criterionGAN = torch.nn.MSELoss().cuda() self.criterionCycle = torch.nn.L1Loss().cuda() self.criterionIdentity = torch.nn.L1Loss().cuda() # init optimizer self.optimizer_G = torch.optim.Adam(itertools.chain( self.netG_AtoB.parameters(), self.netG_BtoA.parameters()), lr=0.0001, betas=(0.5, 0.999)) self.optimizer_D_a = torch.optim.Adam(self.netD_A.parameters(), lr=0.0001, betas=(0.5, 0.999)) self.optimizer_D_b = torch.optim.Adam(self.netD_B.parameters(), lr=0.0001, betas=(0.5, 0.999)) self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR( self.optimizer_G, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) self.lr_scheduler_D_a = torch.optim.lr_scheduler.LambdaLR( self.optimizer_D_a, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) self.lr_scheduler_D_b = torch.optim.lr_scheduler.LambdaLR( self.optimizer_D_b, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) def set_input(self, input_real, input_fake): self.image_real_sizes = input_real['A_sizes'] input_A = input_real['A'] self.input_A.resize_(input_A.size()).copy_(input_A) self.image_real_paths = input_real['A_paths'] # self.size_real = (int(self.image_real_sizes[0]), int(self.image_real_sizes[1])) self.image_fake_sizes = input_fake['B_sizes'] input_B = input_fake['B'] self.input_B.resize_(input_B.size()).copy_(input_B) self.image_fake_paths = input_fake['B_paths'] # self.size_fake = (int(self.image_fake_sizes[0]), int(self.image_fake_sizes[1])) def train(self): real_A = Variable(self.input_A) real_B = Variable(self.input_B) target_real = Variable(self.Tensor(real_B.size(0), 1, 14, 62).fill_(1.0), requires_grad=False) target_fake = Variable(self.Tensor(real_B.size(0), 1, 14, 62).fill_(0.0), requires_grad=False) loss_gan = self.criterionGAN 
loss_cycle = self.criterionCycle loss_identity = self.criterionIdentity self.optimizer_G.zero_grad() i_b = self.netG_AtoB(real_B) loss_identity_B = loss_identity(i_b, real_B) * 0.5 i_a = self.netG_BtoA(real_A) loss_identity_A = loss_identity(i_a, real_A) * 0.5 fake_B = self.netG_AtoB(real_A) pred_fake = self.netD_B(fake_B) loss_gan_A2B = loss_gan(pred_fake, target_real) fake_A = self.netG_BtoA(real_B) pred_fake = self.netD_A(fake_A) loss_gan_B2A = loss_gan(pred_fake, target_real) recovered_a = self.netG_BtoA(fake_B) loss_cycle_A = loss_cycle(recovered_a, real_A) * 10.0 recovered_b = self.netG_AtoB(fake_A) loss_cycle_B = loss_cycle(recovered_b, real_B) * 10.0 loss_G = loss_identity_A + loss_identity_B + loss_gan_A2B + loss_gan_B2A + loss_cycle_A + loss_cycle_B loss_G.backward() self.optimizer_G.step() self.optimizer_D_a.zero_grad() pred_real = self.netD_A(real_A) loss_d_real = loss_gan(pred_real, target_real) fake_A = self.fake_A_Buffer.push_and_pop(fake_A) pred_fake = self.netD_A(fake_A.detach()) loss_d_fake = loss_gan(pred_fake, target_fake) loss_d_a = (loss_d_real + loss_d_fake) * 0.5 loss_d_a.backward() self.optimizer_D_a.step() self.optimizer_D_b.zero_grad() pred_real = self.netD_B(real_B) loss_d_real = loss_gan(pred_real, target_real) fake_B = self.fake_B_Buffer.push_and_pop(fake_B) pred_fake = self.netD_B(fake_B.detach()) loss_d_fake = loss_gan(pred_fake, target_fake) loss_d_b = (loss_d_real + loss_d_fake) * 0.5 loss_d_b.backward() self.optimizer_D_b.step() print( 'Generator Total Loss : {a:.3f}, Generator Identity Loss : {b:.3f}, Generator GAN Loss : {c:.3f}, ' 'Generator Cycle Loss : {d:.3f}'.format( a=loss_G, b=loss_identity_A + loss_identity_B, c=loss_gan_A2B + loss_gan_B2A, d=loss_cycle_A + loss_cycle_B)) print('Discriminator Loss : {a:.3f}'.format(a=loss_d_a + loss_d_b)) def update_learning_rate(self): self.lr_scheduler_G.step() self.lr_scheduler_D_a.step() self.lr_scheduler_D_b.step() def save_checkpoint(self): torch.save(self.netG_AtoB.state_dict(), './checkpoints/netG_A2B.pth') torch.save(self.netG_BtoA.state_dict(), './checkpoints/netG_B2A.pth') torch.save(self.netD_A.state_dict(), './checkpoints/netD_A.pth') torch.save(self.netD_B.state_dict(), './checkpoints/netD_B.pth') def forward(self): self.real_A = Variable(self.input_A) self.fake_B = self.netG_AtoB(self.real_A) def get_image_paths(self): return self.image_real_paths, self.image_fake_paths def get_image_sizes(self): return self.size_real, self.size_fake def get_current_visuals(self): real_A = util.tensor2im(self.real_A.data) fake_B = util.tensor2im(self.fake_B.data) return OrderedDict([('original', real_A), ('restyled', fake_B)])
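# The LambdaLR object passed as lr_lambda in the GAN snippets is a small helper class
# (distinct from torch.optim.lr_scheduler.LambdaLR) whose step() returns a multiplicative
# learning-rate factor for a given epoch. A minimal sketch assuming the usual linear decay
# to zero between decay_epoch and n_epochs; the exact formula is an assumption.
class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, "decay must start before training ends"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # Factor is 1.0 until decay_start_epoch, then decays linearly to 0.0 at n_epochs.
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (self.n_epochs - self.decay_start_epoch)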
def train(self, env):
    # Memory
    memory = ReplayBuffer(capacity=self.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()

        while not done:
            if total_numsteps < self.start_steps:
                action = env.action_space.sample()  # Sample random action
            else:
                action = self.select_action(state)  # Sample action from policy

            if len(memory) > self.batch_size:
                # Number of updates per step in environment
                for i in range(self.updates_per_step):
                    # Update parameters of all the networks
                    q1_loss, q2_loss, policy_loss, alpha_loss = self.update_parameters(
                        memory, self.batch_size, updates)
                    updates += 1

            next_state, reward, done, _ = env.step(action)  # Step
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward
            if self.render:
                env.render()

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            done = 0 if episode_steps == env._max_episode_steps else done

            memory.push(state, action, reward, next_state, done)  # Append transition to memory
            state = next_state

        logger.info('UPDATE')
        logger.record_tabular('q1_loss', q1_loss)
        logger.record_tabular('q2_loss', q2_loss)
        logger.record_tabular('policy_loss', policy_loss)
        logger.record_tabular('alpha_loss', alpha_loss)
        logger.dump_tabular()

        logger.info('STATUS')
        logger.record_tabular('i_episode', i_episode)
        logger.record_tabular('episode_steps', episode_steps)
        logger.record_tabular('total_numsteps', total_numsteps)
        logger.record_tabular('episode_reward', episode_reward)
        logger.dump_tabular()

        if i_episode % 100 == 0:
            logger.info('SAVE')
            self.save_model('../saved/sac')

        if total_numsteps > self.num_steps:
            return
class Agent(BaseAgent): def __init__(self, env, **kwargs): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.obs_space = env.observation_space self.action_space = env.action_space super(Agent, self).__init__(env.action_space) mask = kwargs.get('mask', 2) mask_hi = kwargs.get('mask_hi', 19) self.rule = kwargs.get('rule', 'c') self.danger = kwargs.get('danger', 0.9) self.bus_thres = kwargs.get('threshold', 0.1) self.max_low_len = kwargs.get('max_low_len', 19) self.converter = graphGoalConverter(env, mask, mask_hi, self.danger, self.device, self.rule) self.thermal_limit = env._thermal_limit_a self.convert_obs = self.converter.convert_obs self.action_dim = self.converter.n self.order_dim = len(self.converter.masked_sorted_sub) self.node_num = env.dim_topo self.delay_step = 2 self.update_step = 0 self.k_step = 1 self.nheads = kwargs.get('head_number', 8) self.target_update = kwargs.get('target_update', 1) self.hard_target = kwargs.get('hard_target', False) self.use_order = (self.rule == 'o') self.gamma = kwargs.get('gamma', 0.99) self.tau = kwargs.get('tau', 1e-3) self.dropout = kwargs.get('dropout', 0.) self.memlen = kwargs.get('memlen', int(1e5)) self.batch_size = kwargs.get('batch_size', 128) self.update_start = self.batch_size * 8 self.actor_lr = kwargs.get('actor_lr', 5e-5) self.critic_lr = kwargs.get('critic_lr', 5e-5) self.embed_lr = kwargs.get('embed_lr', 5e-5) self.alpha_lr = kwargs.get('alpha_lr', 5e-5) self.state_dim = kwargs.get('state_dim', 128) self.n_history = kwargs.get('n_history', 6) self.input_dim = self.converter.n_feature * self.n_history print( f'N: {self.node_num}, O: {self.input_dim}, S: {self.state_dim}, A: {self.action_dim}, ({self.order_dim})' ) print(kwargs) self.emb = EncoderLayer(self.input_dim, self.state_dim, self.nheads, self.node_num, self.dropout).to(self.device) self.temb = EncoderLayer(self.input_dim, self.state_dim, self.nheads, self.node_num, self.dropout).to(self.device) self.Q = DoubleSoftQ(self.state_dim, self.nheads, self.node_num, self.action_dim, self.use_order, self.order_dim, self.dropout).to(self.device) self.tQ = DoubleSoftQ(self.state_dim, self.nheads, self.node_num, self.action_dim, self.use_order, self.order_dim, self.dropout).to(self.device) self.actor = Actor(self.state_dim, self.nheads, self.node_num, self.action_dim, self.use_order, self.order_dim, self.dropout).to(self.device) # copy parameters self.tQ.load_state_dict(self.Q.state_dict()) self.temb.load_state_dict(self.emb.state_dict()) # entropy self.target_entropy = -self.action_dim * 3 if not self.use_order else -3 * ( self.action_dim + self.order_dim) self.log_alpha = torch.FloatTensor([-3]).to(self.device) self.log_alpha.requires_grad = True # optimizers self.Q.optimizer = optim.Adam(self.Q.parameters(), lr=self.critic_lr) self.actor.optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr) self.emb.optimizer = optim.Adam(self.emb.parameters(), lr=self.embed_lr) self.alpha_optim = optim.Adam([self.log_alpha], lr=self.alpha_lr) self.memory = ReplayBuffer(max_size=self.memlen) self.Q.eval() self.tQ.eval() self.emb.eval() self.temb.eval() self.actor.eval() def is_safe(self, obs): for ratio, limit in zip(obs.rho, self.thermal_limit): # Seperate big line and small line if (limit < 400.00 and ratio >= self.danger - 0.05) or ratio >= self.danger: return False return True def load_mean_std(self, mean, std): self.state_mean = mean self.state_std = std.masked_fill(std < 1e-5, 1.) 
self.state_mean[0, sum(self.obs_space.shape[:20]):] = 0 self.state_std[0, sum(self.action_space.shape[:20]):] = 1 def state_normalize(self, s): s = (s - self.state_mean) / self.state_std return s def reset(self, obs): self.converter.last_topo = np.ones(self.node_num, dtype=int) self.topo = None self.goal = None self.goal_list = [] self.low_len = -1 self.adj = None self.stacked_obs = [] self.low_actions = [] self.save = False def cache_stat(self): cache = { 'last_topo': self.converter.last_topo, 'topo': self.topo, 'goal': self.goal, 'goal_list': self.goal_list, 'low_len': self.low_len, 'adj': self.adj, 'stacked_obs': self.stacked_obs, 'low_actions': self.low_actions, 'save': self.save, } return cache def load_cache_stat(self, cache): self.converter.last_topo = cache['last_topo'] self.topo = cache['topo'] self.goal = cache['goal'] self.goal_list = cache['goal_list'] self.low_len = cache['low_len'] self.adj = cache['adj'] self.stacked_obs = cache['stacked_obs'] self.low_actions = cache['low_actions'] self.save = cache['save'] def hash_goal(self, goal): hashed = '' for i in goal.view(-1): hashed += str(int(i.item())) return hashed def stack_obs(self, obs): obs_vect = obs.to_vect() obs_vect = torch.FloatTensor(obs_vect).unsqueeze(0) obs_vect, self.topo = self.convert_obs(self.state_normalize(obs_vect)) if len(self.stacked_obs) == 0: for _ in range(self.n_history): self.stacked_obs.append(obs_vect) else: self.stacked_obs.pop(0) self.stacked_obs.append(obs_vect) self.adj = (torch.FloatTensor(obs.connectivity_matrix()) + torch.eye(int(obs.dim_topo))).to(self.device) self.converter.last_topo = np.where(obs.topo_vect == -1, self.converter.last_topo, obs.topo_vect) def reconnect_line(self, obs): # if the agent can reconnect powerline not included in controllable substation, return action # otherwise, return None dislines = np.where(obs.line_status == False)[0] for i in dislines: act = None if obs.time_next_maintenance[ i] != 0 and i in self.converter.lonely_lines: sub_or = self.action_space.line_or_to_subid[i] sub_ex = self.action_space.line_ex_to_subid[i] if obs.time_before_cooldown_sub[sub_or] == 0: act = self.action_space( {'set_bus': { 'lines_or_id': [(i, 1)] }}) if obs.time_before_cooldown_sub[sub_ex] == 0: act = self.action_space( {'set_bus': { 'lines_ex_id': [(i, 1)] }}) if obs.time_before_cooldown_line[i] == 0: status = self.action_space.get_change_line_status_vect() status[i] = True act = self.action_space({'change_line_status': status}) if act is not None: return act return None def get_current_state(self): return torch.cat(self.stacked_obs + [self.topo], dim=-1) def act(self, obs, reward, done): sample = (reward is None) self.stack_obs(obs) is_safe = self.is_safe(obs) self.save = False # reconnect powerline when the powerline in uncontrollable substations is disconnected if False in obs.line_status: act = self.reconnect_line(obs) if act is not None: return act # generate goal if it is initial or previous goal has been reached if self.goal is None or (not is_safe and self.low_len == -1): goal, bus_goal, low_actions, order, Q1, Q2 = self.generate_goal( sample, obs, not sample) if len(low_actions) == 0: act = self.action_space() if self.goal is None: self.update_goal(goal, bus_goal, low_actions, order, Q1, Q2) return self.action_space() self.update_goal(goal, bus_goal, low_actions, order, Q1, Q2) act = self.pick_low_action(obs) return act def pick_low_action(self, obs): # Safe and there is no queued low actions, just do nothing if self.is_safe(obs) and self.low_len == -1: act = 
self.action_space() return act # optimize low actions every step self.low_actions = self.optimize_low_actions(obs, self.low_actions) self.low_len += 1 # queue has been empty after optimization. just do nothing if len(self.low_actions) == 0: act = self.action_space() self.low_len = -1 # normally execute low action from low actions queue else: sub_id, new_topo = self.low_actions.pop(0)[:2] act = self.converter.convert_act(sub_id, new_topo, obs.topo_vect) # When it meets maximum low action execution time, log and reset if self.max_low_len <= self.low_len: self.low_len = -1 return act def high_act(self, stacked_state, adj, sample=True): order, Q1, Q2 = None, 0, 0 with torch.no_grad(): # stacked_state # B, N, F stacked_t, stacked_x = stacked_state[..., -1:], stacked_state[..., :-1] emb_input = stacked_x state = self.emb(emb_input, adj).detach() actor_input = [state, stacked_t.squeeze(-1)] if sample: action, std = self.actor.sample(actor_input, adj) if self.use_order: action, order = action critic_input = action Q1, Q2 = self.Q(state, critic_input, adj, order) Q1, Q2 = Q1.detach()[0].item(), Q2.detach()[0].item() if self.use_order: std, order_std = std else: action = self.actor.mean(actor_input, adj) if self.use_order: action, order = action if order is not None: order = order.detach().cpu() return action.detach().cpu(), order, Q1, Q2 def make_candidate_goal(self, stacked_state, adj, sample, obs): goal, order, Q1, Q2 = self.high_act(stacked_state, adj, sample) bus_goal = torch.zeros_like(goal).long() bus_goal[goal > self.bus_thres] = 1 low_actions = self.converter.plan_act( bus_goal, obs.topo_vect, order[0] if order is not None else None) low_actions = self.optimize_low_actions(obs, low_actions) return goal, bus_goal, low_actions, order, Q1, Q2 def generate_goal(self, sample, obs, nosave=False): stacked_state = self.get_current_state().to(self.device) adj = self.adj.unsqueeze(0) goal, bus_goal, low_actions, order, Q1, Q2 = self.make_candidate_goal( stacked_state, adj, sample, obs) return goal, bus_goal, low_actions, order, Q1, Q2 def update_goal(self, goal, bus_goal, low_actions, order=None, Q1=0, Q2=0): self.order = order self.goal = goal self.bus_goal = bus_goal self.low_actions = low_actions self.low_len = 0 self.save = True self.goal_list.append(self.hash_goal(bus_goal)) def optimize_low_actions(self, obs, low_actions): # remove overlapped action optimized = [] cooldown_list = obs.time_before_cooldown_sub if self.max_low_len != 1 and self.rule == 'c': low_actions = self.converter.heuristic_order(obs, low_actions) for low_act in low_actions: sub_id, sub_goal = low_act[:2] sub_goal, same = self.converter.inspect_act( sub_id, sub_goal, obs.topo_vect) if not same: optimized.append((sub_id, sub_goal, cooldown_list[sub_id])) # sort by cooldown_sub if self.max_low_len != 1 and self.rule != 'o': optimized = sorted(optimized, key=lambda x: x[2]) # if current action has cooldown, then discard if len(optimized) > 0 and optimized[0][2] > 0: optimized = [] return optimized def append_sample(self, s, m, a, r, s2, m2, d, order): if self.use_order: self.memory.append((s, m, a, r, s2, m2, int(d), order)) else: self.memory.append((s, m, a, r, s2, m2, int(d))) def unpack_batch(self, batch): if self.use_order: states, adj, actions, rewards, states2, adj2, dones, orders = list( zip(*batch)) orders = torch.cat(orders, 0) else: states, adj, actions, rewards, states2, adj2, dones = list( zip(*batch)) states = torch.cat(states, 0) states2 = torch.cat(states2, 0) adj = torch.stack(adj, 0) adj2 = torch.stack(adj2, 0) 
actions = torch.cat(actions, 0) rewards = torch.FloatTensor(rewards).unsqueeze(1) dones = torch.FloatTensor(dones).unsqueeze(1) if self.use_order: return states.to(self.device), adj.to(self.device), actions.to(self.device), rewards.to(self.device), \ states2.to(self.device), adj2.to(self.device), dones.to(self.device), orders.to(self.device) else: return states.to(self.device), adj.to(self.device), actions.to(self.device), \ rewards.to(self.device), states2.to(self.device), adj2.to(self.device), dones.to(self.device) def update(self): self.update_step += 1 batch = self.memory.sample(self.batch_size) orders = None if self.use_order: stacked_states, adj, actions, rewards, stacked_states2, adj2, dones, orders = self.unpack_batch( batch) else: stacked_states, adj, actions, rewards, stacked_states2, adj2, dones = self.unpack_batch( batch) self.Q.train() self.emb.train() self.actor.eval() # critic loss stacked_t, stacked_x = stacked_states[..., -1:], stacked_states[..., :-1] stacked2_t, stacked2_x = stacked_states2[..., -1:], stacked_states2[ ..., :-1] emb_input = stacked_x emb_input2 = stacked2_x states = self.emb(emb_input, adj) states2 = self.emb(emb_input2, adj2) actor_input2 = [states2, stacked2_t.squeeze(-1)] with torch.no_grad(): tstates2 = self.temb(emb_input2, adj2).detach() action2, log_pi2 = self.actor.rsample(actor_input2, adj2) order2 = None if self.use_order: action2, order2 = action2 log_pi2 = log_pi2[0] + log_pi2[1] critic_input2 = action2 targets = self.tQ.min_Q(tstates2, critic_input2, adj2, order2) - self.log_alpha.exp() * log_pi2 targets = rewards + (1 - dones) * self.gamma * targets.detach() critic_input = actions predQ1, predQ2 = self.Q(states, critic_input, adj, orders) Q1_loss = F.mse_loss(predQ1, targets) Q2_loss = F.mse_loss(predQ2, targets) loss = Q1_loss + Q2_loss self.Q.optimizer.zero_grad() self.emb.optimizer.zero_grad() loss.backward() self.emb.optimizer.step() self.Q.optimizer.step() self.Q.eval() if self.update_step % self.delay_step == 0: # actor loss self.actor.train() states = self.emb(emb_input, adj) actor_input = [states, stacked_t.squeeze(-1)] action, log_pi = self.actor.rsample(actor_input, adj) order = None if self.use_order: action, order = action log_pi = log_pi[0] + log_pi[1] critic_input = action actor_loss = ( self.log_alpha.exp() * log_pi - self.Q.min_Q(states, critic_input, adj, order)).mean() self.emb.optimizer.zero_grad() self.actor.optimizer.zero_grad() actor_loss.backward() self.emb.optimizer.step() self.actor.optimizer.step() self.actor.eval() # target update if self.hard_target: self.tQ.load_state_dict(self.Q.state_dict()) self.temb.load_state_dict(self.emb.state_dict()) else: for tp, p in zip(self.tQ.parameters(), self.Q.parameters()): tp.data.copy_(self.tau * p + (1 - self.tau) * tp) for tp, p in zip(self.temb.parameters(), self.emb.parameters()): tp.data.copy_(self.tau * p + (1 - self.tau) * tp) # alpha loss alpha_loss = self.log_alpha * (-log_pi.detach() - self.target_entropy).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.emb.eval() return predQ1.detach().mean().item(), predQ2.detach().mean().item() def save_model(self, path, name): torch.save(self.actor.state_dict(), os.path.join(path, f'{name}_actor.pt')) torch.save(self.emb.state_dict(), os.path.join(path, f'{name}_emb.pt')) torch.save(self.Q.state_dict(), os.path.join(path, f'{name}_Q.pt')) def load_model(self, path, name=None): head = '' if name is not None: head = name + '_' self.actor.load_state_dict( torch.load(os.path.join(path, 
f'{head}actor.pt'), map_location=self.device)) self.emb.load_state_dict( torch.load(os.path.join(path, f'{head}emb.pt'), map_location=self.device)) self.Q.load_state_dict( torch.load(os.path.join(path, f'{head}Q.pt'), map_location=self.device))
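# --- Illustrative sketch (not from the original source) ---
# The agent above only relies on a small ReplayBuffer interface: a constructor
# taking max_size, append(transition) as used in append_sample(), and
# sample(batch_size) as used in update(). A minimal deque-based stand-in that
# satisfies those call sites could look like this; the project's own
# ReplayBuffer may differ in detail.
import random
from collections import deque


class ReplayBuffer:
    def __init__(self, max_size=int(1e5)):
        # oldest transitions are evicted automatically once max_size is reached
        self.buffer = deque(maxlen=max_size)

    def append(self, transition):
        # transition is the tuple built in append_sample():
        # (state, adj, action, reward, next_state, next_adj, done[, order])
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform sampling without replacement; unpack_batch() expects a list of tuples
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)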
optimizer_D_A, lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step) lr_scheduler_D_B = torch.optim.lr_scheduler.LambdaLR( optimizer_D_B, lr_lambda=LambdaLR(opt.n_epochs, opt.epoch, opt.decay_epoch).step) # Inputs & targets memory allocation Tensor = torch.cuda.FloatTensor if opt.cuda else torch.Tensor input_A = Tensor(opt.batchSize, opt.input_nc, opt.size, opt.size) input_B = Tensor(opt.batchSize, opt.output_nc, opt.size, opt.size) target_real = Variable(Tensor(opt.batchSize, 1).fill_(1.0), requires_grad=False) target_fake = Variable(Tensor(opt.batchSize, 1).fill_(0.0), requires_grad=False) fake_A_buffer = ReplayBuffer() fake_Am_buffer = ReplayBuffer() fake_B_buffer = ReplayBuffer() fake_Bm_buffer = ReplayBuffer() # Dataset loader transforms_ = [ transforms.Resize(int(opt.size * 1.12), Image.BICUBIC), transforms.RandomCrop(opt.size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize([0.5], [0.5]) ] dataloader = DataLoader(Dataset(opt.dataroot, transforms_=transforms_, unaligned=True),
class DDPGAgent(): def __init__(self, state_size, action_size, num_agents): self.state_size = state_size self.action_size = action_size self.seed = random.seed(RANDOM_SEED) self.num_agents = num_agents # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size).to(device) self.actor_target = Actor(state_size, action_size).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size).to(device) self.critic_target = Critic(state_size, action_size).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE) # Directory where to save the model self.model_dir = os.getcwd() + "/DDPG/saved_models" os.makedirs(self.model_dir, exist_ok=True) def step(self, states, actions, rewards, next_states, dones): for i in range(self.num_agents): self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i]) if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, states, add_noise=True): states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): actions[i, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) # adds gradient clipping to stabilize learning self.critic_optimizer.step() actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def save_model(self): torch.save( self.actor_local.state_dict(), os.path.join(self.model_dir, 'actor_params.pth') ) torch.save( self.actor_optimizer.state_dict(), os.path.join(self.model_dir, 'actor_optim_params.pth') ) torch.save( self.critic_local.state_dict(), os.path.join(self.model_dir, 'critic_params.pth') ) torch.save( self.critic_optimizer.state_dict(), os.path.join(self.model_dir, 'critic_optim_params.pth') ) def load_model(self): """Loads weights from saved model.""" self.actor_local.load_state_dict( torch.load(os.path.join(self.model_dir, 'actor_params.pth')) ) self.actor_optimizer.load_state_dict( torch.load(os.path.join(self.model_dir, 'actor_optim_params.pth')) ) self.critic_local.load_state_dict( torch.load(os.path.join(self.model_dir, 'critic_params.pth')) ) 
self.critic_optimizer.load_state_dict( torch.load(os.path.join(self.model_dir, 'critic_optim_params.pth')) )
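# --- Illustrative sketch (not from the original source) ---
# A minimal driver loop for the DDPGAgent above, assuming a multi-agent
# environment handle that returns one state/reward/done per agent; the env
# object, episode count and max_t are hypothetical placeholders.
import numpy as np


def train(env, agent, n_episodes=200, max_t=1000):
    for episode in range(1, n_episodes + 1):
        states = env.reset()                  # shape: (num_agents, state_size)
        agent.reset()                         # reset the OU noise process
        scores = np.zeros(agent.num_agents)
        for _ in range(max_t):
            actions = agent.act(states)       # noisy actions clipped to [-1, 1]
            next_states, rewards, dones, _ = env.step(actions)
            # store each agent's transition and learn once enough samples exist
            agent.step(states, actions, rewards, next_states, dones)
            states = next_states
            scores += rewards
            if np.any(dones):
                break
        print(f"Episode {episode}\tmean score: {scores.mean():.2f}")
    agent.save_model()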
class GanModel(BaseModel): def name(self): return 'TrainGanModel' def initialize(self, args): BaseModel.initialize(self, args) self.input_B = self.Tensor(args.batchSize, 3, 1024, 256) self.input_C = self.Tensor(args.batchSize, 1, 1024, 256) self.fake_Buffer = ReplayBuffer() self.netG_BtoC = networks.define_G(3, 1, 64, 'unet_128', 'batch', False, args.init_type, self.gpu_ids) self.netD_C = networks.define_D(1, 64, 'basic', norm='batch', use_sigmoid=False, gpu_ids=args.gpu_ids) self.netG_BtoC.apply(weights_init_normal) self.netD_C.apply(weights_init_normal) checkpoint_BtoC_filename = 'netG_B2C.pth' checkpoint_D_C_filename = 'netD_C.pth' checkpoint_path_BtoC = os.path.join(args.checkpoints_dir, checkpoint_BtoC_filename) checkpoint_path_D_C = os.path.join(args.checkpoints_dir, checkpoint_D_C_filename) # Load checkpoint # self.netG_BtoC.load_state_dict(torch.load(checkpoint_path_BtoC)) # self.netD_C.load_state_dict(torch.load(checkpoint_path_D_C)) # define loss self.criterionGAN = torch.nn.MSELoss() self.criterionReconstruction = torch.nn.L1Loss().cuda() # init optimizer self.optimizer_G = torch.optim.Adam(self.netG_BtoC.parameters(), lr=0.0002, betas=(0.5, 0.999)) self.optimizer_D = torch.optim.Adam(self.netD_C.parameters(), lr=0.0002, betas=(0.5, 0.999)) self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR( self.optimizer_G, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) self.lr_scheduler_D = torch.optim.lr_scheduler.LambdaLR( self.optimizer_D, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) def set_input(self, input): self.image_syn_sizes = input['B_sizes'] input_B = input['B'] save_image(input_B[0], './input_check/rgb.jpg') self.input_B.resize_(input_B.size()).copy_(input_B) self.image_syn_paths = input['B_paths'] # self.size_syn = (int(self.image_syn_sizes[0]), int(self.image_syn_sizes[1])) self.image_dep_sizes = input['C_sizes'] input_C = input['C'] save_image(input_C[0], './input_check/depth.jpg') self.input_C.resize_(input_C.size()).copy_(input_C) self.image_dep_paths = input['C_paths'] # self.size_dep = (int(self.image_dep_sizes[0]), int(self.image_dep_sizes[1])) def train(self): syn_data = Variable(self.input_B) dep_data = Variable(self.input_C) target_real = Variable(self.Tensor(syn_data.size(0), 1, 14, 62).fill_(1.0), requires_grad=False) target_fake = Variable(self.Tensor(syn_data.size(0), 1, 14, 62).fill_(0.0), requires_grad=False) loss_gan = self.criterionGAN loss_rec = self.criterionReconstruction self.optimizer_G.zero_grad() fake_dep = self.netG_BtoC(syn_data) loss_r = loss_rec(fake_dep, dep_data) loss_g = loss_gan(self.netD_C(fake_dep), target_real) loss_G = 0.01 * loss_g + 0.99 * loss_r # loss_G = loss_g loss_G.backward() self.optimizer_G.step() self.optimizer_D.zero_grad() pred_real = self.netD_C(dep_data) loss_real = loss_gan(pred_real, target_real) fake_A = self.fake_Buffer.push_and_pop(fake_dep) pred_fake = self.netD_C(fake_A) loss_fake = loss_gan(pred_fake, target_fake) loss_D = (loss_real + loss_fake) * 0.5 loss_D.backward() self.optimizer_D.step() print( 'Generator Loss : {loss_G:.5f}, Discriminator Loss : {loss_D:.5f}'. 
format(loss_G=loss_G, loss_D=loss_D)) def update_learning_rate(self): self.lr_scheduler_G.step() self.lr_scheduler_D.step() def save_checkpoint(self): torch.save(self.netG_BtoC.state_dict(), './checkpoints/netG_B2C.pth') torch.save(self.netD_C.state_dict(), './checkpoints/netD_C.pth') def forward(self): self.syn_data = Variable(self.input_B) self.pred_depth = self.netG_BtoC(self.syn_data) def get_image_paths(self): return self.image_syn_paths, self.image_dep_paths def get_image_sizes(self): return self.image_syn_sizes, self.image_dep_sizes def get_current_visuals(self): syn_d = util.tensor2im(self.syn_data.data) pred_d = util.tensor2im(self.pred_depth.data) return OrderedDict([('original', syn_d), ('depth', pred_d)])
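# --- Illustrative sketch (not from the original source) ---
# The discriminator update above calls self.fake_Buffer.push_and_pop(fake_dep),
# which suggests the CycleGAN-style image pool: it stores up to max_size
# previously generated fakes and, half the time, returns (and replaces) an old
# one instead of the newest, which smooths discriminator training. A stand-in
# consistent with that call could look like this.
import random
import torch


class ReplayBuffer:
    def __init__(self, max_size=50):
        assert max_size > 0
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, batch):
        to_return = []
        for element in batch.detach():
            element = element.unsqueeze(0)
            if len(self.data) < self.max_size:
                self.data.append(element)
                to_return.append(element)
            elif random.uniform(0, 1) > 0.5:
                idx = random.randint(0, self.max_size - 1)
                to_return.append(self.data[idx].clone())
                self.data[idx] = element      # swap an old fake for the new one
            else:
                to_return.append(element)
        return torch.cat(to_return)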
class Agent: def __init__(self, gamma=0.999, buffer_size=1e5, batch_size=1024, episodes_nr=50000, tau=2e-2, gym_name='MountainCarContinuous-v0'): self.lr_actor = 5e-3 # learning rate for the actor self.lr_critic = 1e-3 # learning rate for the critic self.lr_decay = 1 # learning rate decay (per episode) self.l2_reg_actor = 1e-7 # L2 regularization factor for the actor self.l2_reg_critic = 1e-7 # L2 regularization factor for the critic self.num_episodes = episodes_nr # number of episodes self.max_steps_ep = 10000 # default max number of steps per episode (unless env has a lower hardcoded limit) self.train_every = 1 # number of steps to run the policy (and collect experience) before updating network weights self.replay_memory_capacity = buffer_size # capacity of experience replay memory self.batch_size = batch_size self.memory = ReplayBuffer(int(buffer_size)) self.episodes_nr = episodes_nr self.gamma = gamma self.tau = tau self.env = gym.make(gym_name) assert(self.env.action_space.high == -self.env.action_space.low) self.action_range = self.env.action_space.high[0] self.action_dim = np.prod(np.array(self.env.action_space.shape)) self.state_dim = np.prod(np.array(self.env.observation_space.shape)) #self.noise = OUNoise(self.action_dim) self.action_range = self.env.action_space.high - self.env.action_space.low self.initial_noise_scale = 0.1 # scale of the exploration noise process (1.0 is the range of each action dimension) self.noise_decay = 1 #0.99 # decay rate (per episode) of the scale of the exploration noise process self.exploration_mu = 0.0 # mu parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt self.exploration_theta = 0.15 # theta parameter for the exploration noise process: dXt = theta*(mu-Xt)*dt + sigma*dWt self.exploration_sigma = 0.2 # sigma parameter for the exploration noise process: dXt = theta*(mu-Xt )*dt + sigma*dWt self.noise = OUNoise(self.action_dim) def run(self): tf.reset_default_graph() state_ph = tf.placeholder(dtype=tf.float32, shape=[None,self.state_dim]) action_ph = tf.placeholder(dtype=tf.float32, shape=[None,self.action_dim]) reward_ph = tf.placeholder(dtype=tf.float32, shape=[None]) next_state_ph = tf.placeholder(dtype=tf.float32, shape=[None,self.state_dim]) is_not_terminal_ph = tf.placeholder(dtype=tf.float32, shape=[None]) # indicators (go into target computation) # episode counter episodes = tf.Variable(0.0, trainable=False, name='episodes') episode_inc_op = episodes.assign_add(1) actions = Actor(state_ph, self.action_range, self.action_dim, "local").out target_actions = tf.stop_gradient(Actor(next_state_ph, self.action_range, self.action_dim, "target").out) q_det = Critic(action_ph, state_ph, "local", reuse=False).q q_inf = Critic(actions, state_ph, "local", reuse=True).q target_critic = tf.stop_gradient(Critic(target_actions, next_state_ph, "target").q) actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor_local') slow_target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='actor_target') critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='critic_local') slow_target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic_target') update_targets_ops = [] for i, slow_target_actor_var in enumerate(slow_target_actor_vars): update_slow_target_actor_op = slow_target_actor_var.assign(self.tau*actor_vars[i]+(1-self.tau)*slow_target_actor_var) update_targets_ops.append(update_slow_target_actor_op) for i, slow_target_var in 
enumerate(slow_target_critic_vars): update_slow_target_critic_op = slow_target_var.assign(self.tau*critic_vars[i]+(1-self.tau)*slow_target_var) update_targets_ops.append(update_slow_target_critic_op) update_slow_targets_op = tf.group(*update_targets_ops, name='update_slow_targets') targets = tf.expand_dims(reward_ph, 1) + tf.expand_dims(is_not_terminal_ph, 1) * self.gamma * target_critic td_errors = targets - q_det critic_loss = tf.reduce_mean(tf.square(td_errors)) for var in critic_vars: if not 'bias' in var.name: critic_loss += self.l2_reg_critic * 0.5 * tf.nn.l2_loss(var) # critic optimizer critic_train_op = tf.train.AdamOptimizer(self.lr_critic*self.lr_decay**episodes).minimize(critic_loss) # actor loss function (mean Q-values under current policy with regularization) actor_loss = -1*tf.reduce_mean(q_inf) for var in actor_vars: if not 'bias' in var.name: actor_loss += self.l2_reg_actor * 0.5 * tf.nn.l2_loss(var) # actor optimizer # the gradient of the mean Q-values wrt actor params is the deterministic policy gradient (keeping critic params fixed) actor_train_op = tf.train.AdamOptimizer(self.lr_actor*self.lr_decay**episodes).minimize(actor_loss, var_list=actor_vars) # initialize session sess = tf.Session() sess.run(tf.global_variables_initializer()) total_steps = 0 for ep in range(self.num_episodes): total_reward = 0 steps_in_ep = 0 #noise_process = np.zeros(self.action_dim) #noise_scale = (self.initial_noise_scale * self.noise_decay**ep) * self.action_range # Initial state observation = self.env.reset() if ep%1 == 0: self.env.render() for t in range(self.max_steps_ep): # choose action based on deterministic policy action_for_state, = sess.run(actions, feed_dict = {state_ph: observation[None]}) # add temporally-correlated exploration noise to action (using an Ornstein-Uhlenbeck process) # print(action_for_state) #noise_process = self.exploration_theta*(self.exploration_mu - noise_process) + self.exploration_sigma*np.random.randn(self.action_dim) # print(noise_scale*noise_process) action_for_state += self.noise.sample() #noise_process #*noise_scale # take step next_observation, reward, done, _info = self.env.step(action_for_state) if ep%1 == 0: self.env.render() total_reward += reward self.memory.add_to_memory((observation, action_for_state, reward, next_observation, 0.0 if done else 1.0)) # update network weights to fit a minibatch of experience if total_steps%self.train_every == 0 and self.memory.len() >= self.batch_size: # grab N (s,a,r,s') tuples from replay memory minibatch = self.memory.sample_from_memory(self.batch_size) # update the critic and actor params using mean-square value error and deterministic policy gradient, respectively _, _ = sess.run([critic_train_op, actor_train_op], feed_dict = { state_ph: np.asarray([elem[0] for elem in minibatch]), action_ph: np.asarray([elem[1] for elem in minibatch]), reward_ph: np.asarray([elem[2] for elem in minibatch]), next_state_ph: np.asarray([elem[3] for elem in minibatch]), is_not_terminal_ph: np.asarray([elem[4] for elem in minibatch])}) # update slow actor and critic targets towards current actor and critic _ = sess.run(update_slow_targets_op) observation = next_observation total_steps += 1 steps_in_ep += 1 if done: # Increment episode counter _ = sess.run(episode_inc_op) break print('Episode %2i, Reward: %7.3f, Steps: %i'%(ep,total_reward,steps_in_ep)) self.env.close()
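# --- Illustrative sketch (not from the original source) ---
# The exploration comments in __init__ above describe an Ornstein-Uhlenbeck
# process (dXt = theta*(mu - Xt)*dt + sigma*dWt), and self.noise.sample() is
# used to perturb the deterministic action. A minimal OUNoise consistent with
# that usage could look like this; parameter names are assumptions.
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process at its long-run mean
        self.state = self.mu.copy()

    def sample(self):
        # mean-reverting drift plus Gaussian diffusion
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state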
class Agent(): def __init__( self, input_dims, n_actions, layer_sizes, act_lr=0.00003, crt_lr=0.0003, gamma=0.99, max_size=1000000, tau=0.005, batch_size=64, reward_scale=1, name='sac', chkpt_dir='tmp/ddpg', layerNorm=True, ): '''Higher reward scale means higher weights given to rewards ratehr than entropy''' self.gamma = gamma self.tau = tau self.batch_size = batch_size self.input_dims = input_dims self.n_actions = n_actions # The env action was scaled to [-1, 1] self.max_action = np.ones(self.n_actions) # Cannot use env.action_space.high, because env.action_space.high is not real action space self.layer_sizes = layer_sizes self.layerNorm = layerNorm self.memory = ReplayBuffer(max_size, self.input_dims, self.n_actions) self.actor = ActorNetwork(act_lr, self.input_dims, self.n_actions, self.max_action, fc_dims=self.layer_sizes, name='Actor_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.critic_1 = CriticNetwork(crt_lr, self.input_dims, self.n_actions, self.layer_sizes, name='critic1_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.critic_2 = CriticNetwork(crt_lr, self.input_dims, self.n_actions, self.layer_sizes, name='critic2_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.value = ValueNetwork(crt_lr, self.input_dims, self.layer_sizes, name='value_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.target_value = ValueNetwork(crt_lr, self.input_dims, self.layer_sizes, name='target_value_' + name, chkpt_dir=chkpt_dir, layerNorm=self.layerNorm) self.scale = reward_scale self.update_network_parameters(tau=1) def choose_action(self, observation): state = T.Tensor([observation]).to(self.actor.device) actions, _ = self.actor.sample_normal(state, reparameterize=False) return actions.cpu().detach().numpy()[0] def remember(self, state, action, reward, new_state, done): self.memory.store_transition(state, action, reward, new_state, done) def update_network_parameters(self, tau=None): if tau is None: tau = self.tau updated_value = update_single_target_network_parameters( self.value, self.target_value, tau) self.target_value.load_state_dict(updated_value) def save_models(self): print('.... saving models ....') self.actor.save_checkpoint() self.value.save_checkpoint() # self.target_value.save_checkpoint() self.critic_1.save_checkpoint() self.critic_2.save_checkpoint() def load_models(self): print('.... 
loading models ....') self.actor.load_checkpoint() self.value.load_checkpoint() # self.target_value.load_checkpoint() self.critic_1.load_checkpoint() self.critic_2.load_checkpoint() def learn(self): if self.memory.mem_cntr < self.batch_size: return state, action, reward, new_state, done = \ self.memory.sample_buffer(self.batch_size) reward = T.tensor(reward, dtype=T.float).to(self.actor.device) done = T.tensor(done).to(self.actor.device) state_ = T.tensor(new_state, dtype=T.float).to(self.actor.device) state = T.tensor(state, dtype=T.float).to(self.actor.device) action = T.tensor(action, dtype=T.float).to(self.actor.device) # Update the value network self.value.optimizer.zero_grad() value = self.value.forward(state).view(-1) actions, log_probs = self.actor.sample_normal(state, reparameterize=False) log_probs = log_probs.view(-1) # Use the action from the current policy, rather than the one stored in the buffer q1_new_policy = self.critic_1.forward(state, actions).view(-1) q2_new_policy = self.critic_2.forward(state, actions).view(-1) critic_value = T.min(q1_new_policy, q2_new_policy) value_target = critic_value - log_probs # - log_probs is entropy value_loss = F.mse_loss(value, value_target) value_loss.backward(retain_graph=True) self.value.optimizer.step() # Update the critic network self.critic_1.optimizer.zero_grad() self.critic_2.optimizer.zero_grad() # action and state are from replay buffer generated by old policy q1_old_policy = self.critic_1.forward(state, action).view(-1) q2_old_policy = self.critic_2.forward(state, action).view(-1) value_ = self.target_value.forward(state_).view(-1) # value_[done] = 0.0 # In building context, terminal state does not have 0 value q_hat = self.scale * reward + self.gamma * value_ critic_1_loss = F.mse_loss(q1_old_policy, q_hat) critic_2_loss = F.mse_loss(q2_old_policy, q_hat) critic_loss = critic_1_loss + critic_2_loss critic_loss.backward() self.critic_1.optimizer.step() self.critic_2.optimizer.step() # Update the actor network self.actor.optimizer.zero_grad() actions, log_probs = self.actor.sample_normal(state, reparameterize=True) log_probs = log_probs.view(-1) # Use the action from the current policy, rather than the one stored in the buffer q1_new_policy = self.critic_1.forward(state, actions).view(-1) q2_new_policy = self.critic_2.forward(state, actions).view(-1) critic_value = T.min(q1_new_policy, q2_new_policy) actor_loss = log_probs - critic_value actor_loss = T.mean(actor_loss) actor_loss.backward(retain_graph=True) self.actor.optimizer.step() self.update_network_parameters() return critic_loss.item(), actor_loss.item()
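# --- Illustrative sketch (not from the original source) ---
# update_network_parameters() above calls
# update_single_target_network_parameters(self.value, self.target_value, tau)
# and loads the returned dict into the target network, which suggests a helper
# that blends online and target parameters with weight tau. A sketch consistent
# with that call site (names and exact behavior are assumptions):
def update_single_target_network_parameters(online_net, target_net, tau):
    online_params = dict(online_net.named_parameters())
    target_params = dict(target_net.named_parameters())
    updated = {}
    for name in online_params:
        # soft update: target <- tau * online + (1 - tau) * target
        updated[name] = tau * online_params[name].clone() + \
            (1 - tau) * target_params[name].clone()
    return updated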
class DDPG: def __init__( self, env, gamma=0.99, polyak=0.995, act_noise=0.1, render=False, batch_size=32, q_lr=1e-3, p_lr=1e-4, buffer_capacity=5000, max_episodes=100, save_path=None, load_path=None, print_freq=1, start_steps=10000, log_dir='logs/train', training=True, ): self.gamma = gamma self.polyak = polyak self.act_noise = act_noise self.render = render self.batch_size = batch_size self.p_lr = p_lr self.q_lr = q_lr self.max_episodes = max_episodes self.start_steps = start_steps self.actor, self.critic = create_actor_critic( env.observation_space.shape[0], env.action_space.shape[0], env.action_space.high) self.target_actor, self.target_critic = create_actor_critic( env.observation_space.shape[0], env.action_space.shape[0], env.action_space.high) self.target_actor.set_weights(self.actor.get_weights()) self.target_critic.set_weights(self.critic.get_weights()) self.env = env self.rewards = [] self.print_freq = print_freq self.save_path = save_path if training: self.buffer = ReplayBuffer(buffer_capacity) self.actor_optimizer = tf.keras.optimizers.Adam( learning_rate=self.p_lr) self.critic_optimizer = tf.keras.optimizers.Adam( learning_rate=self.q_lr) self.summary_writer = tf.summary.create_file_writer(log_dir) self.mse = tf.keras.losses.MeanSquaredError() if load_path is not None: self.actor.load_weights(f'{load_path}/actor') self.critic.load_weights(f'{load_path}/critic') @tf.function def train_step(self, states, actions, targets): with tf.GradientTape() as tape: action_predictions = self.actor(states) q_values = self.critic([states, action_predictions]) policy_loss = -tf.reduce_mean(q_values) actor_gradients = tape.gradient(policy_loss, self.actor.trainable_variables) self.actor_optimizer.apply_gradients( zip(actor_gradients, self.actor.trainable_variables)) with tf.GradientTape() as tape: q_values = self.critic([states, actions]) mse_loss = self.mse(q_values, targets) critic_gradients = tape.gradient(mse_loss, self.critic.trainable_variables) self.critic_optimizer.apply_gradients( zip(critic_gradients, self.critic.trainable_variables)) with self.summary_writer.as_default(): tf.summary.scalar('Policy Loss', policy_loss, step=self.critic_optimizer.iterations) tf.summary.scalar('MSE Loss', mse_loss, step=self.critic_optimizer.iterations) tf.summary.scalar('Estimated Q Value', tf.reduce_mean(q_values), step=self.critic_optimizer.iterations) def update(self): if len(self.buffer) >= self.batch_size: # Sample random minibatch of N transitions states, actions, rewards, next_states, dones = self.buffer.sample( self.batch_size) dones = dones.reshape(-1, 1) rewards = rewards.reshape(-1, 1) # Set the target for learning target_action_preds = self.target_actor(next_states) target_q_values = self.target_critic( [next_states, target_action_preds]) targets = rewards + self.gamma * target_q_values * (1 - dones) # update critic by minimizing the MSE loss # update the actor policy using the sampled policy gradient self.train_step(states, actions, targets) # Update target networks polyak_average(self.actor.variables, self.target_actor.variables, self.polyak) polyak_average(self.critic.variables, self.target_critic.variables, self.polyak) def act(self, obs, noise=False): # Initialize a random process N for action exploration norm_dist = tf.random.normal(self.env.action_space.shape, stddev=self.act_noise) action = self.actor(np.expand_dims(obs, axis=0)) action = np.clip(action.numpy() + (norm_dist.numpy() if noise else 0), a_min=self.env.action_space.low, a_max=self.env.action_space.high) return action def 
learn(self): mean_reward = None total_steps = 0 overall_steps = 0 for ep in range(self.max_episodes): if ep % self.print_freq == 0 and ep > 0: new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:]) print( f"-------------------------------------------------------") print( f"Mean {self.print_freq} Episode Reward: {new_mean_reward}" ) print(f"Mean Steps: {total_steps / self.print_freq}") print(f"Total Episodes: {ep}") print(f"Total Steps: {overall_steps}") print( f"-------------------------------------------------------") total_steps = 0 with self.summary_writer.as_default(): tf.summary.scalar(f'Mean {self.print_freq} Episode Reward', new_mean_reward, step=ep) # Model saving inspired by Open AI Baseline implementation if (mean_reward is None or new_mean_reward >= mean_reward ) and self.save_path is not None: print( f"Saving model due to mean reward increase:{mean_reward} -> {new_mean_reward}" ) print(f'Location: {self.save_path}') mean_reward = new_mean_reward self.actor.save_weights(f'{self.save_path}/actor') self.critic.save_weights(f'{self.save_path}/critic') # Receive initial observation state s_1 obs = self.env.reset() done = False episode_reward = 0 ep_len = 0 while not done: # Display the environment if self.render: self.env.render() # Execute action and observe reward and observe new state if self.start_steps > 0: self.start_steps -= 1 action = self.env.action_space.sample() else: # Select action according to policy and exploration noise action = self.act(obs, noise=True).flatten() new_obs, rew, done, info = self.env.step(action) new_obs = new_obs.flatten() episode_reward += rew # Store transition in R self.buffer.add((obs, action, rew, new_obs, done)) # Perform a single learning step self.update() obs = new_obs ep_len += 1 with self.summary_writer.as_default(): tf.summary.scalar(f'Episode Reward', episode_reward, step=ep) self.rewards.append(episode_reward) total_steps += ep_len overall_steps += ep_len
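# --- Illustrative sketch (not from the original source) ---
# update() above relies on polyak_average(online_vars, target_vars, polyak) to
# move the target networks toward the online ones. A helper consistent with
# that usage (target <- polyak * target + (1 - polyak) * online):
def polyak_average(variables, target_variables, polyak):
    for var, target_var in zip(variables, target_variables):
        target_var.assign(polyak * target_var + (1.0 - polyak) * var)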
def initialize(self, args): BaseModel.initialize(self, args) self.input_A = self.Tensor(args.batchSize, 3, 1024, 256) self.input_B = self.Tensor(args.batchSize, 3, 1024, 256) self.fake_A_Buffer = ReplayBuffer() self.fake_B_Buffer = ReplayBuffer() self.netG_AtoB = networks.define_G(3, 3, 64, 'resnet_9blocks', 'instance', False, args.init_type, self.gpu_ids) self.netG_BtoA = networks.define_G(3, 3, 64, 'resnet_9blocks', 'instance', False, args.init_type, self.gpu_ids) self.netD_A = networks.define_D(3, 64, 'basic', norm='instance', use_sigmoid=False, gpu_ids=args.gpu_ids) self.netD_B = networks.define_D(3, 64, 'basic', norm='instance', use_sigmoid=False, gpu_ids=args.gpu_ids) self.netG_AtoB.apply(weights_init_normal) self.netG_BtoA.apply(weights_init_normal) self.netD_A.apply(weights_init_normal) self.netD_B.apply(weights_init_normal) checkpoint_AtoB_filename = 'netG_A2B.pth' checkpoint_BtoA_filename = 'netG_B2A.pth' checkpoint_D_A_filename = 'netD_A.pth' checkpoint_D_B_filename = 'netD_B.pth' checkpoint_path_AtoB = os.path.join(args.checkpoints_dir, checkpoint_AtoB_filename) checkpoint_path_BtoA = os.path.join(args.checkpoints_dir, checkpoint_BtoA_filename) checkpoint_path_D_A = os.path.join(args.checkpoints_dir, checkpoint_D_A_filename) checkpoint_path_D_B = os.path.join(args.checkpoints_dir, checkpoint_D_B_filename) # Load checkpoint # self.netG_AtoB.load_state_dict(torch.load(checkpoint_path_AtoB)) # self.netG_BtoA.load_state_dict(torch.load(checkpoint_path_BtoA)) # self.netD_A.load_state_dict(torch.load(checkpoint_path_D_A)) # self.netD_B.load_state_dict(torch.load(checkpoint_path_D_B)) # define loss # self.criterionGAN = networks.GANLoss().to(self.device) self.criterionGAN = torch.nn.MSELoss().cuda() self.criterionCycle = torch.nn.L1Loss().cuda() self.criterionIdentity = torch.nn.L1Loss().cuda() # init optimizer self.optimizer_G = torch.optim.Adam(itertools.chain( self.netG_AtoB.parameters(), self.netG_BtoA.parameters()), lr=0.0001, betas=(0.5, 0.999)) self.optimizer_D_a = torch.optim.Adam(self.netD_A.parameters(), lr=0.0001, betas=(0.5, 0.999)) self.optimizer_D_b = torch.optim.Adam(self.netD_B.parameters(), lr=0.0001, betas=(0.5, 0.999)) self.lr_scheduler_G = torch.optim.lr_scheduler.LambdaLR( self.optimizer_G, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) self.lr_scheduler_D_a = torch.optim.lr_scheduler.LambdaLR( self.optimizer_D_a, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step) self.lr_scheduler_D_b = torch.optim.lr_scheduler.LambdaLR( self.optimizer_D_b, lr_lambda=LambdaLR(args.n_epochs, args.epoch, args.decay_epoch).step)
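# --- Illustrative sketch (not from the original source) ---
# The schedulers above pass LambdaLR(n_epochs, epoch, decay_epoch).step as the
# lr_lambda, which points to the usual CycleGAN schedule: keep the multiplier
# at 1.0 until decay_epoch, then decay linearly to 0 by n_epochs. A stand-in
# consistent with that constructor signature:
class LambdaLR:
    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, "decay must start before training ends"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        # multiplicative factor applied to the base learning rate at this epoch
        return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (
            self.n_epochs - self.decay_start_epoch)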
def main(): time_str = time.strftime("%Y%m%d-%H%M%S") print('time_str: ', time_str) exp_count = 0 if args.experiment == 'a|s': direc_name_ = '_'.join([args.env, args.experiment]) else: direc_name_ = '_'.join( [args.env, args.experiment, 'bp2VAE', str(args.bp2VAE)]) direc_name_exist = True while direc_name_exist: exp_count += 1 direc_name = '/'.join([direc_name_, str(exp_count)]) direc_name_exist = os.path.exists(direc_name) try: os.makedirs(direc_name) except OSError as e: if e.errno != errno.EEXIST: raise if args.tensorboard_dir is None: logger = Logger('/'.join([direc_name, time_str])) else: logger = Logger(args.tensorboard_dir) env = gym.make(args.env) if args.wrapper: if args.video_dir is None: args.video_dir = '/'.join([direc_name, 'videos']) env = gym.wrappers.Monitor(env, args.video_dir, force=True) print('observation_space: ', env.observation_space) print('action_space: ', env.action_space) env.seed(args.seed) torch.manual_seed(args.seed) if args.experiment == 'a|s': dim_x = env.observation_space.shape[0] elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' or \ args.experiment == 'a|z(a_prev, s, s_next)': dim_x = args.z_dim policy = ActorCritic(input_size=dim_x, hidden1_size=3 * dim_x, hidden2_size=6 * dim_x, action_size=env.action_space.n) if args.use_cuda: Tensor = torch.cuda.FloatTensor torch.cuda.manual_seed_all(args.seed) policy.cuda() else: Tensor = torch.FloatTensor policy_optimizer = optim.Adam(policy.parameters(), lr=args.policy_lr) if args.experiment != 'a|s': from util import ReplayBuffer, vae_loss_function dim_s = env.observation_space.shape[0] if args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)': from model import VAE vae = VAE(input_size=dim_s, hidden1_size=3 * args.z_dim, hidden2_size=args.z_dim) elif args.experiment == 'a|z(a_prev, s, s_next)': from model import CVAE vae = CVAE(input_size=dim_s, class_size=1, hidden1_size=3 * args.z_dim, hidden2_size=args.z_dim) if args.use_cuda: vae.cuda() vae_optimizer = optim.Adam(vae.parameters(), lr=args.vae_lr) if args.experiment == 'a|z(s)': from util import Transition_S2S as Transition elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)': from util import Transition_S2SNext as Transition buffer = ReplayBuffer(args.buffer_capacity, Transition) update_vae = True if args.experiment == 'a|s': from util import Record_S elif args.experiment == 'a|z(s)': from util import Record_S2S elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)': from util import Record_S2SNext def train_actor_critic(n): saved_info = policy.saved_info R = 0 cum_returns_ = [] for r in policy.rewards[::-1]: R = r + args.gamma * R cum_returns_.insert(0, R) cum_returns = Tensor(cum_returns_) cum_returns = (cum_returns - cum_returns.mean()) \ / (cum_returns.std() + np.finfo(np.float32).eps) cum_returns = Variable(cum_returns, requires_grad=False).unsqueeze(1) batch_info = SavedInfo(*zip(*saved_info)) batch_log_prob = torch.cat(batch_info.log_prob) batch_value = torch.cat(batch_info.value) batch_adv = cum_returns - batch_value policy_loss = -torch.sum(batch_log_prob * batch_adv) value_loss = F.smooth_l1_loss(batch_value, cum_returns, size_average=False) policy_optimizer.zero_grad() total_loss = policy_loss + value_loss total_loss.backward() policy_optimizer.step() if args.use_cuda: logger.scalar_summary('value_loss', value_loss.data.cpu()[0], n) logger.scalar_summary('policy_loss', policy_loss.data.cpu()[0], n) all_value_loss.append(value_loss.data.cpu()[0]) 
all_policy_loss.append(policy_loss.data.cpu()[0]) else: logger.scalar_summary('value_loss', value_loss.data[0], n) logger.scalar_summary('policy_loss', policy_loss.data[0], n) all_value_loss.append(value_loss.data[0]) all_policy_loss.append(policy_loss.data[0]) del policy.rewards[:] del policy.saved_info[:] if args.experiment != 'a|s': def train_vae(n): train_times = (n // args.vae_update_frequency - 1) * args.vae_update_times for i in range(args.vae_update_times): train_times += 1 sample = buffer.sample(args.batch_size) batch = Transition(*zip(*sample)) state_batch = torch.cat(batch.state) if args.experiment == 'a|z(s)': recon_batch, mu, log_var = vae.forward(state_batch) mse_loss, kl_loss = vae_loss_function( recon_batch, state_batch, mu, log_var, logger, train_times, kl_discount=args.kl_weight, mode=args.experiment) elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)': next_state_batch = Variable(torch.cat(batch.next_state), requires_grad=False) predicted_batch, mu, log_var = vae.forward(state_batch) mse_loss, kl_loss = vae_loss_function( predicted_batch, next_state_batch, mu, log_var, logger, train_times, kl_discount=args.kl_weight, mode=args.experiment) vae_loss = mse_loss + kl_loss vae_optimizer.zero_grad() vae_loss.backward() vae_optimizer.step() logger.scalar_summary('vae_loss', vae_loss.data[0], train_times) all_vae_loss.append(vae_loss.data[0]) all_mse_loss.append(mse_loss.data[0]) all_kl_loss.append(kl_loss.data[0]) # To store cum_reward, value_loss and policy_loss from each episode all_cum_reward = [] all_last_hundred_average = [] all_value_loss = [] all_policy_loss = [] if args.experiment != 'a|s': # Store each vae_loss calculated all_vae_loss = [] all_mse_loss = [] all_kl_loss = [] for episode in count(1): done = False state_ = torch.Tensor([env.reset()]) cum_reward = 0 if args.experiment == 'a|z(a_prev, s, s_next)': action = random.randint(0, 2) state_, reward, done, info = env.step(action) cum_reward += reward state_ = torch.Tensor([np.append(state_, action)]) while not done: if args.experiment == 'a|s': state = Variable(state_, requires_grad=False) elif args.experiment == 'a|z(s)' or args.experiment == 'a|z(s, s_next)' \ or args.experiment == 'a|z(a_prev, s, s_next)': state_ = Variable(state_, requires_grad=False) mu, log_var = vae.encode(state_) if args.bp2VAE and update_vae: state = vae.reparametrize(mu, log_var) else: state = vae.reparametrize(mu, log_var).detach() action_ = policy.select_action(state) if args.use_cuda: action = action_.cpu()[0, 0] else: action = action_[0, 0] next_state_, reward, done, info = env.step(action) next_state_ = torch.Tensor([next_state_]) cum_reward += reward if args.render: env.render() policy.rewards.append(reward) if args.experiment == 'a|z(s)': buffer.push(state_) elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)': if not done: buffer.push(state_, next_state_) if args.experiment == 'a|z(a_prev, s, s_next)': next_state_ = torch.cat( [next_state_, torch.Tensor([action])], 1) state_ = next_state_ train_actor_critic(episode) last_hundred_average = sum(all_cum_reward[-100:]) / 100 logger.scalar_summary('cum_reward', cum_reward, episode) logger.scalar_summary('last_hundred_average', last_hundred_average, episode) all_cum_reward.append(cum_reward) all_last_hundred_average.append(last_hundred_average) if update_vae: if args.experiment != 'a|s' and episode % args.vae_update_frequency == 0: assert len(buffer) >= args.batch_size train_vae(episode) if len(all_vae_loss) > 1000: 
if abs( sum(all_vae_loss[-500:]) / 500 - sum(all_vae_loss[-1000:-500]) / 500) < args.vae_update_threshold: update_vae = False if episode % args.log_interval == 0: print( 'Episode {}\tLast cum return: {:5f}\t100-episodes average cum return: {:.2f}' .format(episode, cum_reward, last_hundred_average)) if episode > args.num_episodes: print("100-episodes average cum return is now {} and " "the last episode runs to {} time steps!".format( last_hundred_average, cum_reward)) env.close() torch.save(policy, '/'.join([direc_name, 'model'])) if args.experiment == 'a|s': record = Record_S( policy_loss=all_policy_loss, value_loss=all_value_loss, cum_reward=all_cum_reward, last_hundred_average=all_last_hundred_average) elif args.experiment == 'a|z(s)': record = Record_S2S( policy_loss=all_policy_loss, value_loss=all_value_loss, cum_reward=all_cum_reward, last_hundred_average=all_last_hundred_average, mse_recon_loss=all_mse_loss, kl_loss=all_kl_loss, vae_loss=all_vae_loss) elif args.experiment == 'a|z(s, s_next)' or args.experiment == 'a|z(a_prev, s, s_next)': record = Record_S2SNext( policy_loss=all_policy_loss, value_loss=all_value_loss, cum_reward=all_cum_reward, last_hundred_average=all_last_hundred_average, mse_pred_loss=all_mse_loss, kl_loss=all_kl_loss, vae_loss=all_vae_loss) pickle.dump(record, open('/'.join([direc_name, 'record']), 'wb')) break
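# --- Illustrative sketch (not from the original source) ---
# train_vae() above calls vae_loss_function(recon_or_pred, target, mu, log_var,
# logger, step, kl_discount=..., mode=...) and expects the reconstruction (MSE)
# and KL terms back separately. A sketch consistent with those call sites,
# written in the same older PyTorch idiom (.data[0]) as the surrounding code;
# the real util.vae_loss_function may differ.
import torch
import torch.nn.functional as F


def vae_loss_function(recon_x, x, mu, log_var, logger, step, kl_discount=1.0, mode='a|z(s)'):
    # reconstruction / prediction error between decoder output and target states
    mse_loss = F.mse_loss(recon_x, x, size_average=False)
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior, scaled by kl_discount
    kl_loss = kl_discount * (-0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp()))
    logger.scalar_summary('mse_loss_' + mode, mse_loss.data[0], step)
    logger.scalar_summary('kl_loss_' + mode, kl_loss.data[0], step)
    return mse_loss, kl_loss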