def __init__(self, args):
    super().__init__(args)
    state_dim = self.env.observation_space.shape[0]
    action_dim = self.env.action_space.shape[0]

    self.actor = DeterministicPolicy(state_dim, action_dim, 64, self.env.action_space).to(device)
    self.actor_target = DeterministicPolicy(state_dim, action_dim, 64, self.env.action_space).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

    self.critic = QNetwork(state_dim, action_dim, 64).to(device)
    self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

    self.replay_buffer = ReplayBuffer(self.args.capacity)
    self.num_critic_update_iteration = 0
    self.num_actor_update_iteration = 0
    self.num_training = 0
    self.global_steps = 0

    if self.args.last_episode > 0:
        self.load(self.args.last_episode)
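
# The agents in this section construct ReplayBuffer with several different
# signatures. As a reference point, here is a minimal FIFO sketch compatible
# with the push/sample/storage usage in the SAC class further down; this is an
# assumption for illustration, not the project's actual buffer.
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []
        self.position = 0

    def push(self, transition):
        # transition is (state, next_state, action, reward, done)
        if len(self.storage) < self.capacity:
            self.storage.append(transition)
        else:
            self.storage[self.position] = transition  # overwrite the oldest entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        x, y, u, r, d = map(np.array, zip(*batch))
        return x, y, u, r, d

    def __len__(self):
        return len(self.storage)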
def __init__(self, network, prep, exp_policy, state_dim, action_dim, name,
             learning_rate=1e-3, hard_update_frequency=500, soft_update_rate=None,
             buffer_size=50000, batch_size=32, num_steps=200000, discount=0.99,
             use_huber_loss=True, detailed_summary=False, max_reward=200,
             steps_before_learn=1000, train_freq=1, save_end=True):
    self.network = network
    self.prep = prep
    self.exp_policy = exp_policy
    self.greedy_policy = policy.Greedy()
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.discount = discount
    self.summary_dir = os.path.join(name, "summary")
    self.use_huber_loss = use_huber_loss
    self.detailed_summary = detailed_summary

    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.hard_update_frequency = hard_update_frequency
    self.soft_update_rate = soft_update_rate
    self.num_steps = num_steps
    self.step = 0
    self.steps_before_learn = steps_before_learn
    self.train_freq = train_freq
    self.solved = False
    self.max_reward = max_reward
    self.save_end = save_end

    self.actions = None
    self.rewards = None
    self.done = None
    self.action_q_values = None
    self.max_target_q_values = None
    self.targets = None
    self.global_step = None
    self.inc_global_step = None
    self.train_op = None
    self.states = None
    self.q_values = None
    self.next_states = None
    self.target_q_values = None
    self.target_update = None

    self.build_all()
    self.merged = tf.summary.merge_all()
    self.session = tf.Session()

    self.summary_dir = utils.new_summary_dir(self.summary_dir)
    self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)
    self.saver = tf.train.Saver(max_to_keep=None)

    init_op = tf.global_variables_initializer()
    self.session.run(init_op)

    self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)
def __init__(self, features, actions, state_array, params):
    super(DQN, self).__init__(features, actions, params)
    self.buffer_BACK = ReplayBuffer(1000)
    self.buffer_STAY = ReplayBuffer(1000)
    self.buffer_FORWARD = ReplayBuffer(1000)
    self.back_values = []
    self.stay_values = []
    self.forward_values = []
    self.td_loss = []
    self.state_array = state_array
def __init__(self, args):
    self.args = args
    self.policy = [Q_net(args) for _ in range(args.n_agents)]
    self.hyperNet = HyperNet(args)
    self.policy_target = [copy.deepcopy(p) for p in self.policy]
    self.hyperNet_target = copy.deepcopy(self.hyperNet)
    self.replayBuffer = ReplayBuffer(args)
    self.preference_pool = Preference(args)

    # one optimizer over all per-agent Q-nets plus the hypernetwork
    policy_param = [policy.parameters() for policy in self.policy]
    self.optim = torch.optim.Adam(
        itertools.chain(*policy_param, self.hyperNet.parameters()),
        lr=self.args.learning_rate)
    self.lr_scheduler = torch.optim.lr_scheduler.StepLR(
        self.optim, step_size=10, gamma=0.95, last_epoch=-1)
def __init__(self, features: int, actions: int, params: Dict, seed: int, collector: Collector):
    self.features = features
    self.actions = actions
    self.params = params
    self.collector = collector
    self.seed = seed

    # define parameter contract
    self.gamma = params['gamma']
    self.epsilon = params.get('epsilon', 0)

    # the mellowmax parameter
    self.omega = params.get('omega', 1.0)

    # set up network for estimating Q(s, a)
    self.value_net = Network(features, actions, params, seed).to(device)

    # build the optimizer
    self.optimizer_params = params['optimizer']
    self.optimizer = deserializeOptimizer(self.value_net.parameters(), self.optimizer_params)

    self.steps = 0

    # set up the replay buffer
    self.buffer_size = params['buffer_size']
    self.batch_size = params['batch']
    self.buffer_type = params.get('buffer', 'standard')

    if self.buffer_type == 'per':
        prioritization = params['prioritization']
        self.buffer = PrioritizedReplayMemory(self.buffer_size, prioritization)
    else:
        self.buffer = ReplayBuffer(self.buffer_size)

    # build a target network
    self.target_refresh = params.get('target_refresh', 1)
    self.target_net = copy.deepcopy(self.value_net)
    self.initializeTargetNet()

    def getValues(x: torch.Tensor):
        qs = self.values(x).detach().cpu().squeeze(0).numpy()
        return qs

    self.policy = createEpsilonGreedy(seed, self.epsilon, getValues)
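
# The `omega` above parameterizes the mellowmax operator, a smooth alternative
# to the hard max when bootstrapping targets. For reference, a minimal sketch
# of the operator itself; the helper name is ours, the agent above only stores omega.
import numpy as np

def mellowmax(q_values, omega=1.0):
    """mm_omega(q) = log(mean(exp(omega * q))) / omega.
    Tends to max(q) as omega -> inf and to mean(q) as omega -> 0+."""
    q = np.asarray(q_values, dtype=np.float64)
    c = q.max()  # subtract the max before exponentiating for numerical stability
    return c + np.log(np.mean(np.exp(omega * (q - c)))) / omega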
def __init__(self, params):
    super(POLOAgent, self).__init__(params)
    self.H_backup = self.params['polo']['H_backup']

    # Create ensemble of value functions
    model_params = params['polo']['ens_params']['model_params']
    model_params['input_size'] = self.N
    model_params['output_size'] = 1

    params['polo']['ens_params']['dtype'] = self.dtype
    params['polo']['ens_params']['device'] = self.device

    self.val_ens = Ensemble(self.params['polo']['ens_params'])

    # Learn from replay buffer
    self.polo_buf = ReplayBuffer(self.N, self.M, self.params['polo']['buf_size'])

    # Value (from forward), value mean, value std
    self.hist['vals'] = np.zeros((self.T, 3))
def __init__(self, args, env=None):
    self.args = args

    # actor
    self.actor = DeterministicPolicy(128).to(device)
    self.actor_target = DeterministicPolicy(128).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

    # critics
    self.critic = QNetwork(128).to(device)
    self.critic_target = QNetwork(128).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

    self.replay_buffer = ReplayBuffer(self.args.capacity)
    self.num_critic_update_iteration = 0
    self.num_actor_update_iteration = 0
    self.num_training = 0
    self.global_steps = 0

    self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
    self.env = env
def init_run(self):
    self.log("Starting init")
    self.r_sum = 0

    if self.state_rep == SQUARE:
        self.state_proc = SquareAroundHeadState(radius=self.state_radius,
                                                step_forward=self.step_forward,
                                                flatten=self.flatten)
    elif self.state_rep == DIAMOND:
        self.state_proc = DiamondAroundHeadState(radius=self.state_radius,
                                                 step_forward=self.step_forward,
                                                 flatten=self.flatten)
    elif self.state_rep == RADAR:
        self.state_proc = RadarState(num_per_type=NUM_PER_TYPE)
    elif self.state_rep == RADAR_PLUS:
        self.state_proc = DoubleStateWrapper(
            SquareAroundHeadState(radius=self.state_radius,
                                  step_forward=self.step_forward,
                                  flatten=self.flatten),
            RadarState(num_per_type=NUM_PER_TYPE))

    self.input_shape = self.state_proc.get_shape()
    self.model = self._build_model()
    self.model.summary()

    if self.huber_loss:
        loss = huber_loss
    else:
        loss = 'mse'

    opt = Adam(self.learning_rate)
    self.model.compile(loss=loss, optimizer=opt)
    self.old_model = keras.models.clone_model(self.model)
    self._save_model()

    self.memory = ReplayBuffer(BUFFER_SIZE)
    self.log("Init finished!")

    self.num_of_samples = 0
    self.sum_of_loss = 0
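
# `huber_loss` is referenced above but not defined in this snippet. A minimal
# Keras-backend sketch of the usual form (quadratic near zero, linear in the
# tails); this is an assumption about the helper, not the project's code.
from keras import backend as K

def huber_loss(y_true, y_pred, delta=1.0):
    """0.5*e^2 for |e| <= delta, otherwise delta*(|e| - 0.5*delta).
    Less sensitive to outlier TD errors than plain MSE."""
    error = y_true - y_pred
    quadratic = K.minimum(K.abs(error), delta)
    linear = K.abs(error) - quadratic
    return K.mean(0.5 * K.square(quadratic) + delta * linear)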
def __init__(self, features, actions, params):
    super(DQN, self).__init__(features, actions, params)
    self.buffer_BACK = ReplayBuffer(1000)
    self.buffer_STAY = ReplayBuffer(1000)
    self.buffer_FORWARD = ReplayBuffer(1000)
    self.back_values = []
    self.stay_values = []
    self.forward_values = []
    self.ratioMap = params['ratioMap']
    self.sampleSize = params['sampleSize']
class SAC(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = GaussianPolicy(state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        self.critic_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_1 = optim.Adam(self.critic_1.parameters(), self.args.lr)
        self.critic_target_1 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_1.load_state_dict(self.critic_1.state_dict())

        self.critic_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_optimizer_2 = optim.Adam(self.critic_2.parameters(), self.args.lr)
        self.critic_target_2 = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target_2.load_state_dict(self.critic_2.state_dict())

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.global_steps = 0

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # get the next action and compute the soft target Q
            with torch.no_grad():
                next_action, log_prob, _ = self.actor.sample(next_state)
                target_Q1 = self.critic_target_1(next_state, next_action)
                target_Q2 = self.critic_target_2(next_state, next_action)
                target_Q = torch.min(target_Q1, target_Q2) - self.args.alpha * log_prob
                y_Q = reward + self.args.gamma * (1 - done) * target_Q

            # update critics
            current_Q1 = self.critic_1(state, action)
            critic_loss1 = F.mse_loss(current_Q1, y_Q)
            self.critic_optimizer_1.zero_grad()
            critic_loss1.backward()
            self.critic_optimizer_1.step()

            current_Q2 = self.critic_2(state, action)
            critic_loss2 = F.mse_loss(current_Q2, y_Q)
            self.critic_optimizer_2.zero_grad()
            critic_loss2.backward()
            self.critic_optimizer_2.step()

            # update actor
            actor_action, actor_log_prob, _ = self.actor.sample(state)
            Q1 = self.critic_1(state, actor_action)
            Q2 = self.critic_2(state, actor_action)
            actor_loss = -(torch.min(Q1, Q2) - self.args.alpha * actor_log_prob).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target networks
            for param, target_param in zip(self.critic_1.parameters(),
                                           self.critic_target_1.parameters()):
                target_param.data.copy_((1 - self.args.tau) * target_param.data
                                        + self.args.tau * param.data)

            for param, target_param in zip(self.critic_2.parameters(),
                                           self.critic_target_2.parameters()):
                target_param.data.copy_((1 - self.args.tau) * target_param.data
                                        + self.args.tau * param.data)

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                self.replay_buffer.push((state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                              .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    ep_r = 0
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()

        self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            state = self.env.reset()
            done = False
            total_rews = 0
            time_step = 0
            while not done:
                with torch.no_grad():
                    # use the mean action
                    action, _, _ = self.actor.sample(torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)

        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        return rewards.max(), rewards.min(), rewards.mean()

    def save(self, episode):
        file_name = self.weights_file(episode)
        torch.save({
            'actor': self.actor.state_dict(),
            'critic_1': self.critic_1.state_dict(),
            'critic_2': self.critic_2.state_dict(),
            'critic_target_1': self.critic_target_1.state_dict(),
            'critic_target_2': self.critic_target_2.state_dict()
        }, file_name)
        print("save model to " + file_name)

    def load(self, episode):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic_1.load_state_dict(checkpoint['critic_1'])
        self.critic_2.load_state_dict(checkpoint['critic_2'])
        self.critic_target_1.load_state_dict(checkpoint['critic_target_1'])
        self.critic_target_2.load_state_dict(checkpoint['critic_target_2'])
        print("successfully load model from " + file_name)
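
# The two polyak-averaging loops at the end of `update` are a common pattern;
# many codebases factor them into a helper like the sketch below. The function
# name is ours, not this repo's.
def soft_update(source, target, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for param, target_param in zip(source.parameters(), target.parameters()):
        target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

# equivalent to the inline loops above:
#   soft_update(self.critic_1, self.critic_target_1, self.args.tau)
#   soft_update(self.critic_2, self.critic_target_2, self.args.tau)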
                          MINI_BATCH, TAU, 0.001, L2C)

##################
# graph auxiliaries
##################
saver = tf.train.Saver()
init = tf.global_variables_initializer()
summary = tf.summary.merge_all()
logger = tf.summary.FileWriter(OUT_DIR, sess.graph)

# initialize mdp state structure
mdp = MDP_state(STATE_SIZE, FRAMES)

# initialize replay buffer
R = ReplayBuffer(MDP_STATE_SIZE, ACTION_SIZE, BUFFER_SIZE)
buf = R.LoadBuffer(OUT_DIR + BUFFER_FILE)
if buf:
    EXP_PROB = EPSILON
    populated = R.GetOccupency()
    print("Replay buffer loaded from disk, occupied: " + str(populated))
else:
    print("Creating new replay buffer")

# load saved model
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# define action discretization
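
# LoadBuffer / GetOccupency are this project's own persistence helpers. A
# pickle-based sketch of what such methods plausibly do -- an assumption about
# their behavior, not the actual implementation (identifier spelling kept as-is).
import os
import pickle

def LoadBuffer(self, path):
    """Restore buffer contents from disk; returns True on success."""
    if not os.path.exists(path):
        return False
    with open(path, "rb") as f:
        self.data, self.occupancy = pickle.load(f)
    return True

def GetOccupency(self):
    """Number of transitions currently stored."""
    return self.occupancy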
class DQN(BaseAgent):
    def __init__(self, features, actions, state_array, params):
        super(DQN, self).__init__(features, actions, params)
        self.buffer_BACK = ReplayBuffer(1000)
        self.buffer_STAY = ReplayBuffer(1000)
        self.buffer_FORWARD = ReplayBuffer(1000)

        self.back_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.back_q_net.cloneWeightsTo(self.back_target_q_net)

        self.stay_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)

        self.forward_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_target_q_net = Network(features, self.h1, self.h2, 1).to(device)
        self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        self.optimizerBack = torch.optim.Adam(self.back_q_net.parameters(),
                                              lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerStay = torch.optim.Adam(self.stay_q_net.parameters(),
                                              lr=self.alpha, betas=(0.9, 0.999))
        self.optimizerForward = torch.optim.Adam(self.forward_q_net.parameters(),
                                                 lr=self.alpha, betas=(0.9, 0.999))

        self.back_values = []
        self.stay_values = []
        self.forward_values = []
        self.back_values_baseline = []
        self.stay_values_baseline = []
        self.forward_values_baseline = []
        self.td_loss = []
        self.state_array = state_array
        self.penultimate_features = []
        self.ratioMap = params['ratioMap']
        self.sampleSize = params['sampleSize']

    def updateNetwork(self, samples):
        # organize the mini-batch so that we can request "columns" from the data
        # e.g. we can get all of the actions, or all of the states with a single call
        batch = getBatchColumns(samples)

        # compute Q(s, a) for each sample in the mini-batch
        Qs, x = self.policy_net(batch.states)
        Qsa = Qs.gather(1, batch.actions).squeeze()
        self.penultimate_features.append(x)

        # by default Q(s', a') = 0 unless the next states are non-terminal
        Qspap = torch.zeros(batch.size, device=device)

        # if we don't have any non-terminal next states, then no need to bootstrap
        if batch.nterm_sp.shape[0] > 0:
            Qsp, _ = self.target_net(batch.nterm_sp)

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff optimize
        # don't worry about detaching the bootstrapping term for semi-gradient Q-learning
        # the target network handles that
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous updates
        self.optimizer.zero_grad()
        self.target_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()
        self.td_loss.append(td_loss.detach().numpy())

        Qs_state_array, _ = self.policy_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)
        self.back_values.append(Qsa_mean_states[0].detach().numpy())
        self.stay_values.append(Qsa_mean_states[1].detach().numpy())
        self.forward_values.append(Qsa_mean_states[2].detach().numpy())

        # update the *policy network* using the combined gradients
        self.optimizer.step()

    def updateActionNet(self, samples, q_net, target_q_net, optimizer, storeList):
        batch = getBatchColumns(samples)
        Qs, x = q_net(batch.states)
        Qsa = Qs.squeeze()

        Qspap = torch.zeros(batch.size, device=device)

        if batch.nterm_sp.shape[0] > 0:
            # bootstrap off of the maximum across all three per-action networks
            Qsp_back, _ = self.back_target_q_net(batch.nterm_sp)
            Qsp_stay, _ = self.stay_target_q_net(batch.nterm_sp)
            Qsp_forward, _ = self.forward_target_q_net(batch.nterm_sp)
            Qsp = torch.hstack([Qsp_back, Qsp_stay, Qsp_forward])

            # bootstrapping term is the max Q value for the next-state
            # only assign to indices where the next state is non-terminal
            Qspap[batch.nterm] = Qsp.max(1).values

        # compute the empirical MSBE for this mini-batch and let torch auto-diff optimize
        target = batch.rewards + batch.gamma * Qspap.detach()
        td_loss = 0.5 * f.mse_loss(target, Qsa)

        # make sure we have no gradients left over from previous updates
        optimizer.zero_grad()
        target_q_net.zero_grad()
        self.back_target_q_net.zero_grad()
        self.stay_target_q_net.zero_grad()
        self.forward_target_q_net.zero_grad()

        # compute the entire gradient of the network using only the td error
        td_loss.backward()

        Qs_state_array, _ = q_net(self.state_array)
        Qsa_mean_states = torch.mean(Qs_state_array, 0)
        storeList.append(Qsa_mean_states[0].detach().numpy())

        optimizer.step()

    def update(self, s, a, sp, r, gamma):
        if a.cpu().numpy() == 0:
            self.buffer_BACK.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 1:
            self.buffer_STAY.add((s, a, sp, r, gamma))
        elif a.cpu().numpy() == 2:
            self.buffer_FORWARD.add((s, a, sp, r, gamma))

        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)
            self.back_q_net.cloneWeightsTo(self.back_target_q_net)
            self.stay_q_net.cloneWeightsTo(self.stay_target_q_net)
            self.forward_q_net.cloneWeightsTo(self.forward_target_q_net)

        back_sample_count = math.floor(self.ratioMap.backward_ratio * self.sampleSize)
        stay_sample_count = math.floor(self.ratioMap.stay_ratio * self.sampleSize)
        forward_sample_count = math.floor(self.ratioMap.forward_ratio * self.sampleSize)

        # as long as we have enough samples in the buffers to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer_BACK) > back_sample_count \
                and len(self.buffer_STAY) > stay_sample_count \
                and len(self.buffer_FORWARD) > forward_sample_count:
            samplesBack, idcs = self.buffer_BACK.sample(back_sample_count)
            samplesStay, idcs = self.buffer_STAY.sample(stay_sample_count)
            samplesForward, idcs = self.buffer_FORWARD.sample(forward_sample_count)

            self.updateActionNet(samplesBack, self.back_q_net, self.back_target_q_net,
                                 self.optimizerBack, self.back_values_baseline)
            self.updateActionNet(samplesStay, self.stay_q_net, self.stay_target_q_net,
                                 self.optimizerStay, self.stay_values_baseline)
            self.updateActionNet(samplesForward, self.forward_q_net, self.forward_target_q_net,
                                 self.optimizerForward, self.forward_values_baseline)

            samples = samplesBack + samplesStay + samplesForward
            self.updateNetwork(samples)
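
# `getBatchColumns` is a project helper; the update code above relies on the
# column view sketched here. Field names are taken from the usage above, the
# implementation is assumed (terminal next states marked here by sp is None).
from collections import namedtuple

BatchColumns = namedtuple('BatchColumns', ['states', 'actions', 'rewards',
                                           'gamma', 'nterm', 'nterm_sp', 'size'])

def getBatchColumns(samples):
    """Stack a list of (s, a, sp, r, gamma) transitions into column tensors."""
    s, a, sp, r, gamma = zip(*samples)
    states = torch.cat(list(s))
    actions = torch.stack(a).unsqueeze(1)
    rewards = torch.tensor(r)
    gammas = torch.tensor(gamma)
    nterm = torch.tensor([i for i, spi in enumerate(sp) if spi is not None])
    nterm_sp = (torch.cat([spi for spi in sp if spi is not None])
                if len(nterm) > 0 else torch.empty((0, states.shape[1])))
    return BatchColumns(states, actions, rewards, gammas, nterm, nterm_sp, len(samples))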
class BCAgent(POLOAgent):
    """
    An agent extending upon POLO that uses behavior cloning on the
    planner-predicted actions as a prior for MPC.
    """

    def __init__(self, params):
        super(BCAgent, self).__init__(params)

        # Initialize policy network
        pol_params = self.params['p-bc']['pol_params']
        pol_params['input_size'] = self.N
        pol_params['output_size'] = self.M

        if 'final_activation' not in pol_params:
            pol_params['final_activation'] = torch.tanh

        self.pol = MLP(pol_params)

        # Create policy optimizer
        ppar = self.params['p-bc']['pol_optim']
        self.pol_optim = torch.optim.Adam(self.pol.parameters(),
                                          lr=ppar['lr'],
                                          weight_decay=ppar['reg'])

        # Use a replay buffer that will save planner actions
        self.pol_buf = ReplayBuffer(self.N, self.M, self.params['p-bc']['buf_size'])

        # Logging (store cum_rew, cum_emp_rew)
        self.hist['pols'] = np.zeros((self.T, 2))

        self.has_pol = True
        self.pol_cache = ()

    def get_action(self):
        """
        BCAgent generates a planned trajectory using the behavior-cloned
        policy and then optimizes it via MPC.
        """
        self.pol.eval()

        # Run a rollout using the policy starting from the current state
        infos = self.get_traj_info()
        self.hist['pols'][self.time] = infos[3:5]
        self.pol_cache = (infos[0], infos[2])
        self.prior_actions = infos[1]

        # Generate trajectory via MPC with the prior actions as a prior
        action = super(BCAgent, self).get_action(prior=self.prior_actions)

        # Add final planning trajectory to BC buffer
        fin_states, fin_rews = self.cache[2], self.cache[3]
        fin_states = np.concatenate(([self.prev_obs], fin_states[1:]))

        pb_pct = self.params['p-bc']['pb_pct']
        pb_len = int(pb_pct * fin_states.shape[0])
        for t in range(pb_len):
            self.pol_buf.update(fin_states[t], fin_states[t + 1],
                                fin_rews[t], self.planned_actions[t], False)

        return action

    def do_updates(self):
        """
        Learn from the saved buffer of planned actions.
        """
        super(BCAgent, self).do_updates()
        if self.time % self.params['p-bc']['update_freq'] == 0:
            self.update_pol()

    def update_pol(self):
        """
        Update the policy via BC on the planner actions.
        """
        self.pol.train()
        params = self.params['p-bc']

        # Generate batches for training
        size = min(self.pol_buf.size, self.pol_buf.total_in)
        num_inds = params['batch_size'] * params['grad_steps']
        inds = np.random.randint(0, size, size=num_inds)

        states = self.pol_buf.buffer['s'][inds]
        acts = self.pol_buf.buffer['a'][inds]

        states = torch.tensor(states, dtype=self.dtype)
        actions = torch.tensor(acts, dtype=self.dtype)

        for i in range(params['grad_steps']):
            bi, ei = i * params['batch_size'], (i + 1) * params['batch_size']

            # Train based on L2 distance between actions and predictions
            preds = self.pol.forward(states[bi:ei])
            preds = torch.squeeze(preds, dim=-1)
            targets = torch.squeeze(actions[bi:ei], dim=-1)
            loss = torch.nn.functional.mse_loss(preds, targets)

            self.pol_optim.zero_grad()
            loss.backward()
            self.pol_optim.step()

    def get_traj_info(self):
        """
        Run the policy for a full trajectory and return details
        about the trajectory.
        """
        env_state = self.env.sim.get_state() if self.mujoco else None
        infos = traj.eval_traj(copy.deepcopy(self.env), env_state, self.prev_obs,
                               mujoco=self.mujoco, perturb=self.perturb,
                               H=self.H, gamma=self.gamma, act_mode='deter',
                               pt=(self.pol, 0), terminal=self.val_ens,
                               tvel=self.tvel)
        return infos

    def print_logs(self):
        """
        BC-specific logging information.
        """
        bi, ei = super(BCAgent, self).print_logs()

        self.print('BC metrics', mode='head')
        self.print('policy traj rew', self.hist['pols'][self.time - 1][0])
        self.print('policy traj emp rew', self.hist['pols'][self.time - 1][1])

        return bi, ei

    def test_policy(self):
        """
        Run the BC action selection mechanism.
        """
        env = copy.deepcopy(self.env)
        obs = env.reset()

        if self.tvel is not None:
            env.set_target_vel(self.tvel)
            obs = env._get_obs()

        env_state = env.sim.get_state() if self.mujoco else None
        infos = traj.eval_traj(env, env_state, obs,
                               mujoco=self.mujoco, perturb=self.perturb,
                               H=self.eval_len, gamma=1, act_mode='deter',
                               pt=(self.pol, 0), tvel=self.tvel)
        self.hist['pol_test'][self.time] = infos[3]
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']
        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2, actions).to(device)
        self.target_net = Network(features, self.h1, self.h2, actions).to(device)
        self.det_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net = Network(features, self.h1, self.h2, actions).to(device)
        self.bpolicy_net.load_state_dict(
            torch.load(
                "/home/soumyadeep/Action_Imbalance/RLGTD/experiments/prediction_SARSA/agents/net_params.pt"
            ))

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha, betas=(0.9, 0.999))

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0

        # initialize the weights of the target network to match the policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # act greedily with respect to the fixed behavior network, but replace
        # the "stay" action (1) with a random back/forward action about
        # epsilon percent of the time
        q_s, _ = self.bpolicy_net(x)
        if q_s.shape[0] == 3:
            q_s = q_s.unsqueeze(0)

        act = torch.max(q_s, 1).indices.detach().numpy()
        for i in range(act.shape[0]):
            if act[i] == 1 and np.random.rand() < self.epsilon:
                act[i] = np.random.choice([0, 2])

        return torch.from_numpy(act).detach().to(device)

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, sp, r, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, sp, r, gamma))
        self.steps += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 200:
            samples, idcs = self.buffer.sample(200)
            self.updateNetwork(samples)
class BaseAgent:
    def __init__(self, features, actions, params):
        self.features = features
        self.actions = actions
        self.params = params

        # define parameter contract
        self.alpha = params['alpha']
        self.epsilon = params['epsilon']
        self.target_refresh = params['target_refresh']
        self.buffer_size = params['buffer_size']
        self.h1 = params['h1']
        self.h2 = params['h2']

        # build two networks, one for the "online" learning policy
        # the other as a fixed target network
        self.policy_net = Network(features, self.h1, self.h2, actions).to(device)
        self.target_net = Network(features, self.h1, self.h2, actions).to(device)

        # build the optimizer for _only_ the policy network
        # target network parameters will be copied from the policy net periodically
        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.alpha, betas=(0.9, 0.999))
        # self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # a simple circular replay buffer (i.e. a FIFO buffer)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.steps = 0
        self.actionCounter = np.zeros((env.width, env.height, env.num_actions))

        # initialize the weights of the target network to match the policy network
        self.policy_net.cloneWeightsTo(self.target_net)

    def selectAction(self, x):
        # take a random action about epsilon percent of the time
        if np.random.rand() < self.epsilon:
            a = np.random.randint(self.actions)
            return torch.tensor(a, device=device)

        # otherwise take a greedy action
        q_s, _ = self.policy_net(x)
        return q_s.argmax().detach()

    def updateNetwork(self, samples):
        pass

    def update(self, s, a, r, sp, gamma):
        # the "online" sample gets tossed into the replay buffer
        self.buffer.add((s, a, r, sp, gamma))
        self.steps += 1

        a = a.numpy()
        s = s.numpy()
        self.actionCounter[s[0][0]][s[0][1]][a] += 1

        # if it is time to set the target net <- policy network
        # do that before the learning step
        if self.steps % self.target_refresh == 0:
            self.policy_net.cloneWeightsTo(self.target_net)

        # as long as we have enough samples in the buffer to do one mini-batch update
        # go ahead and randomly sample a mini-batch and do a single update
        if len(self.buffer) > 32:
            samples, idcs = self.buffer.sample(32)
            self.updateNetwork(samples)
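
# `cloneWeightsTo` is a method on these Network classes rather than a torch
# builtin. The hard copy it performs presumably reduces to load_state_dict, as
# in this one-line sketch (an assumption about the helper, not its actual body).
class Network(torch.nn.Module):
    # ... layers elided ...

    def cloneWeightsTo(self, other):
        """Hard copy: overwrite `other`'s parameters with ours. Used to refresh
        the target network every `target_refresh` steps."""
        other.load_state_dict(self.state_dict())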
class NAF:
    MODEL_NAME = "NAF"
    TARGET_MODEL_NAME = "target-NAF"

    class Build(Enum):
        SINGLE = 1
        MULTIPLE = 2
        HYDRA = 3

    def __init__(self, prep, build, policy, state_dim, action_dim, monitor_directory,
                 buffer_size=10000, batch_size=32, steps_before_train=100, train_freq=1,
                 num_steps=1000000, learning_rate=1e-3, update_rate=1e-3,
                 max_reward=None, detailed_summary=False):
        self.prep = prep
        self.build_mode = build
        self.policy = policy
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.summary_dir = os.path.join(monitor_directory, "summary")
        self.detailed_summary = detailed_summary

        self.discount = 0.99
        self.learning_rate = learning_rate
        self.target_update_rate = update_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.steps_before_train = steps_before_train
        self.train_freq = train_freq
        self.max_reward = max_reward
        self.max_iters = num_steps
        self.step = 0
        self.solved = False

        self.state_layers = [64, 32]
        self.mu_layers = [16, 8, self.action_dim]
        # number of entries in the lower-triangular matrix L (must be an int)
        self.l_layers = [16, 8, (self.action_dim * (self.action_dim + 1)) // 2]
        self.v_layers = [16, 8, 1]

        self.action_inputs = None
        self.reward_inputs = None
        self.done = None
        self.state_inputs = None
        self.state_outputs = None
        self.mu_outputs = None
        self.l_outputs = None
        self.value_outputs = None
        self.next_state_inputs = None
        self.next_state_outputs = None
        self.target_value_outputs = None
        self.target = None
        self.advantages = None
        self.q_values = None
        self.loss = None
        self.global_step = None
        self.inc_global_step = None
        self.train_op = None
        self.target_update = None

        self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim)

        self.build()
        self.merged = tf.summary.merge_all()
        self.session = tf.Session()

        self.summary_dir = utils.new_summary_dir(self.summary_dir)
        utils.log_params(
            self.summary_dir, {
                "learning rate": self.learning_rate,
                "batch size": self.batch_size,
                "update rate": self.target_update_rate,
                "buffer size": self.buffer_size,
                "build": self.build_mode.name,
                "train frequency": self.train_freq
            })

        self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph)
        self.saver = tf.train.Saver(max_to_keep=None)

        init_op = tf.global_variables_initializer()
        self.session.run(init_op)

    def build(self):
        self.action_inputs = tf.placeholder(tf.float32, (None, self.action_dim))
        self.reward_inputs = tf.placeholder(tf.float32, (None,))
        self.done = tf.placeholder(tf.float32, (None,))

        self.state_inputs, self.state_outputs, self.mu_outputs, self.l_outputs, self.value_outputs = \
            self.build_network(self.MODEL_NAME)
        self.next_state_inputs, self.next_state_outputs, _, _, self.target_value_outputs = \
            self.build_network(self.TARGET_MODEL_NAME)

        self.target = tf.expand_dims(self.reward_inputs, 1) + self.discount * (
            1 - tf.expand_dims(self.done, 1)) * self.target_value_outputs

        # taken from https://github.com/carpedm20/NAF-tensorflow/blob/master/src/network.py
        pivot = 0
        rows = []
        for idx in range(self.action_dim):
            count = self.action_dim - idx

            diag_elem = tf.exp(tf.slice(self.l_outputs, (0, pivot), (-1, 1)))
            non_diag_elems = tf.slice(self.l_outputs, (0, pivot + 1), (-1, count - 1))
            row = tf.pad(tf.concat((diag_elem, non_diag_elems), 1), ((0, 0), (idx, 0)))
            rows.append(row)

            pivot += count

        L = tf.transpose(tf.stack(rows, axis=1), (0, 2, 1))
        P = tf.matmul(L, tf.transpose(L, (0, 2, 1)))

        adv_term = tf.expand_dims(self.action_inputs - self.mu_outputs, -1)
        self.advantages = -tf.matmul(tf.transpose(adv_term, [0, 2, 1]),
                                     tf.matmul(P, adv_term)) / 2
        self.advantages = tf.reshape(self.advantages, [-1, 1])

        self.q_values = self.advantages + self.value_outputs

        self.loss = tf.reduce_mean(
            architect.huber_loss(self.q_values - tf.stop_gradient(self.target)))
        tf.summary.scalar("training_loss", self.loss)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step, tf.add(self.global_step, 1))

        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = optimizer.minimize(self.loss)

        self.create_target_update_op()

    def build_network(self, name):
        detailed_summary = self.detailed_summary
        if name == self.TARGET_MODEL_NAME:
            detailed_summary = False

        with tf.variable_scope(name):
            state_inputs = tf.placeholder(tf.float32, shape=(None, self.state_dim))

            if self.build_mode == self.Build.SINGLE:
                state_outputs = architect.dense_block(
                    state_inputs, self.state_layers, name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs, [self.mu_layers[-1]], "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs, [self.l_layers[-1]], "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs, [self.v_layers[-1]], "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.MULTIPLE:
                state_outputs = None
                mu_state = architect.dense_block(
                    state_inputs, self.state_layers, name="mu_state",
                    detailed_summary=detailed_summary)
                l_state = architect.dense_block(
                    state_inputs, self.state_layers, name="l_state",
                    detailed_summary=detailed_summary)
                value_state = architect.dense_block(
                    state_inputs, self.state_layers, name="value_state",
                    detailed_summary=detailed_summary)

                mu_outputs = architect.dense_block(
                    mu_state, [self.mu_layers[-1]], "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    l_state, [self.l_layers[-1]], "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    value_state, [self.v_layers[-1]], "value_branch",
                    detailed_summary=detailed_summary)
            elif self.build_mode == self.Build.HYDRA:
                state_outputs = architect.dense_block(
                    state_inputs, self.state_layers, name="state_branch",
                    detailed_summary=detailed_summary)
                mu_outputs = architect.dense_block(
                    state_outputs, self.mu_layers, "mu_branch",
                    detailed_summary=detailed_summary)
                l_outputs = architect.dense_block(
                    state_outputs, self.l_layers, "l_branch",
                    detailed_summary=detailed_summary)
                value_outputs = architect.dense_block(
                    state_outputs, self.v_layers, "value_branch",
                    detailed_summary=detailed_summary)
            else:
                raise ValueError("Wrong build type.")

            return state_inputs, state_outputs, mu_outputs, l_outputs, value_outputs

    def create_target_update_op(self):
        # inspired by: https://github.com/yukezhu/tensorflow-reinforce/blob/master/rl/neural_q_learner.py
        net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.MODEL_NAME)
        target_net_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                            scope=self.TARGET_MODEL_NAME)

        self.target_update = []
        for v_source, v_target in zip(net_vars, target_net_vars):
            # this is equivalent to target = (1 - alpha) * target + alpha * source
            update_op = v_target.assign_sub(self.target_update_rate * (v_target - v_source))
            self.target_update.append(update_op)

        self.target_update = tf.group(*self.target_update)

    def learn(self):
        # learn
        batch = self.buffer.sample(self.batch_size)
        merged, targets, _ = self.session.run(
            [self.merged, self.target, self.train_op],
            feed_dict={
                self.state_inputs: batch["states"],
                self.action_inputs: batch["actions"],
                self.reward_inputs: batch["rewards"],
                self.next_state_inputs: batch["next_states"],
                self.done: batch["done"]
            })
        self.summary_writer.add_summary(merged, global_step=self.step)

        # target update
        self.session.run(self.target_update)

    def run_episode(self, env):
        self.policy.reset()

        state = env.reset()
        state, skip = self.prep.process(state)

        total_reward = 0

        while True:
            # play
            if skip:
                action = env.action_space.sample()
            else:
                action = self.session.run(self.mu_outputs,
                                          feed_dict={self.state_inputs: state})[0]
                action = self.policy.add_noise(action)

            tmp_state = state
            tmp_skip = skip

            state, reward, done, _ = env.step(action)
            state, skip = self.prep.process(state)
            total_reward += reward

            # only store the transition when both processed states are valid
            if not tmp_skip and not skip:
                self.buffer.add({
                    "state": tmp_state[0],
                    "action": action,
                    "reward": reward,
                    "next_state": state[0],
                    "done": int(done)
                })

            if self.step >= self.steps_before_train and not self.solved:
                # learn
                for _ in range(self.train_freq):
                    self.learn()
                    _, self.step = self.session.run([self.inc_global_step, self.global_step])
            else:
                _, self.step = self.session.run([self.inc_global_step, self.global_step])

            if done:
                break

        summary_value = summary_pb2.Summary.Value(tag="episode_reward",
                                                  simple_value=total_reward)
        summary_2 = summary_pb2.Summary(value=[summary_value])
        self.summary_writer.add_summary(summary_2, global_step=self.step)

        if self.max_reward is not None:
            if total_reward >= self.max_reward:
                self.solved = True
            else:
                self.solved = False

        if self.step == self.max_iters:
            self.saver.save(self.session, self.summary_dir, global_step=self.step)

        return total_reward, self.step

    def close(self):
        self.session.close()
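
# The loop in `build` packs the flat `l_outputs` into a lower-triangular L so
# that P = L L^T is positive definite and the advantage
# A(s, a) = -0.5 * (a - mu)^T P (a - mu) peaks exactly at a = mu. A numpy
# sketch of the same computation for a single state, as a reference for the
# TF graph code above (helper name is ours).
import numpy as np

def naf_advantage(l_flat, mu, action):
    """Quadratic NAF advantage for one state. `l_flat` has d*(d+1)//2 entries
    and fills the lower triangle of L column by column, with diagonal entries
    exponentiated to keep them positive."""
    d = mu.shape[0]
    L = np.zeros((d, d))
    pivot = 0
    for i in range(d):
        count = d - i
        col = l_flat[pivot:pivot + count]
        L[i:, i] = np.concatenate(([np.exp(col[0])], col[1:]))
        pivot += count
    P = L @ L.T
    diff = action - mu
    return -0.5 * diff @ P @ diff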
class DDPG: CRITIC_NAME = "critic" TARGET_CRITIC_NAME = "target_critic" ACTOR_NAME = "actor" TARGET_ACTOR_NAME = "target_actor" def __init__(self, state_dim, action_dim, monitor_directory, actor_learning_rate=1e-5, critic_learning_rate=1e-3, critic_target_update_rate=1e-3, actor_target_update_rate=1e-3, discount=0.99, l2_decay=1e-2, buffer_size=1000000, batch_size=64, detail_summary=False, tanh_action=True, input_batch_norm=True, all_batch_norm=True, log_frequency=10): self.state_dim = state_dim self.action_dim = action_dim self.critic_learning_rate = critic_learning_rate self.actor_learning_rate = actor_learning_rate self.critic_target_update_rate = critic_target_update_rate self.actor_target_update_rate = actor_target_update_rate self.discount = discount self.batch_size = batch_size self.l2_decay = l2_decay self.buffer_size = buffer_size self.summary_dir = os.path.join(monitor_directory, "summary") self.detail_summary = detail_summary self.tanh_action = tanh_action self.input_batch_norm = input_batch_norm self.all_batch_norm = all_batch_norm self.log_frequency = log_frequency self.step = 0 self.solved = False self.buffer = ReplayBuffer(buffer_size, self.state_dim, self.action_dim) self.__build() self.summary_dir = utils.new_summary_dir(self.summary_dir) utils.log_params( self.summary_dir, { "actor learning rate": self.actor_learning_rate, "critic learning rate": self.critic_learning_rate, "batch size": self.batch_size, "actor update rate": self.actor_target_update_rate, "critic update rate": self.critic_target_update_rate, "buffer size": self.buffer_size, }) self.saver = tf.train.Saver(max_to_keep=None) init_op = tf.global_variables_initializer() self.session = tf.Session() self.merged = tf.summary.merge_all() self.summary_writer = tf.summary.FileWriter(self.summary_dir, self.session.graph) self.session.run(init_op) """ PUBLIC """ def learn(self): batch = self.buffer.sample(self.batch_size) self.__train_critic(batch["states"], batch["actions"], batch["rewards"], batch["next_states"], batch["done"]) self.__train_actor(batch["states"]) self.session.run([ self.target_critic_update, self.target_actor_update, self.inc_global_step ]) def act(self, state): a = self.session.run(self.action, feed_dict={ self.state_input: state, self.is_training: False })[0] return a def perceive(self, transition): self.buffer.add(transition) def log_scalar(self, name, value, index): summary_value = summary_pb2.Summary.Value(tag=name, simple_value=value) summary_2 = summary_pb2.Summary(value=[summary_value]) self.summary_writer.add_summary(summary_2, global_step=index) def save(self): self.saver.save(self.session, self.summary_dir, global_step=self.session.run(self.global_step)) def close(self): self.session.close() """ PRIVATE """ def __build_critic(self, name, state_input, action_input): bn_training = self.is_training if name == self.TARGET_CRITIC_NAME: bn_training = False with tf.variable_scope(name): # weights and biases W1 = self.__get_weights((self.state_dim, 400), self.state_dim, name="W1") b1 = self.__get_weights((400, ), self.state_dim, name="b1") W2 = self.__get_weights((400, 300), 400 + self.action_dim, name="W2") b2 = self.__get_weights((300, ), 400 + self.action_dim, name="b2") W2_action = self.__get_weights((self.action_dim, 300), 400 + self.action_dim, name="W2_action") W3 = tf.Variable(tf.random_uniform((300, 1), -3e-3, 3e-3), name="W3") b3 = tf.Variable(tf.random_uniform((1, ), -3e-3, 3e-3), name="b3") # layers if self.input_batch_norm: state_input = tf.layers.batch_normalization( state_input, 
training=bn_training) layer_1 = tf.matmul(state_input, W1) + b1 if self.all_batch_norm: layer_1 = tf.layers.batch_normalization(layer_1, training=bn_training) layer_1 = tf.nn.relu(layer_1) layer_2 = tf.nn.relu( tf.matmul(layer_1, W2) + tf.matmul(action_input, W2_action) + b2) output_layer = tf.matmul(layer_2, W3) + b3 # summary if name == self.CRITIC_NAME: self.critic_summaries = [ tf.summary.histogram("W1", W1), tf.summary.histogram("b1", b1), tf.summary.histogram("W2", W2), tf.summary.histogram("b2", b2), tf.summary.histogram("W2_action", W2_action), tf.summary.histogram("W3", W3), tf.summary.histogram("b3", b3), tf.summary.histogram("layer_1", layer_1), tf.summary.histogram("layer_2", layer_2), tf.summary.histogram("output_layer", output_layer) ] # weight decay weights = [W1, b1, W2, b2, W2_action, W3, b3] weight_decay = tf.add_n( [self.l2_decay * tf.nn.l2_loss(var) for var in weights]) return output_layer, weight_decay def __build_actor(self, name, state_input): bn_training = self.is_training if name == self.TARGET_ACTOR_NAME: bn_training = False with tf.variable_scope(name): # weights and biases W1 = self.__get_weights((self.state_dim, 400), self.state_dim, name="W1") b1 = self.__get_weights((400, ), self.state_dim, name="b1") W2 = self.__get_weights((400, 300), 400, name="W2") b2 = self.__get_weights((300, ), 400, name="b2") W3 = tf.Variable(tf.random_uniform((300, self.action_dim), minval=-3e-3, maxval=3e-3), name="W3") b3 = tf.Variable(tf.random_uniform((self.action_dim, ), -3e-3, 3e-3), name="b3") # layers if self.input_batch_norm: state_input = tf.layers.batch_normalization( state_input, training=bn_training) layer_1 = tf.matmul(state_input, W1) + b1 if self.all_batch_norm: layer_1 = tf.layers.batch_normalization(layer_1, training=bn_training) layer_1 = tf.nn.relu(layer_1) layer_2 = tf.matmul(layer_1, W2) + b2 if self.all_batch_norm: layer_2 = tf.layers.batch_normalization(layer_2, training=bn_training) layer_2 = tf.nn.relu(layer_2) output_layer = tf.matmul(layer_2, W3) + b3 # summary if name == self.ACTOR_NAME: self.actor_summaries = [ tf.summary.histogram("W1", W1), tf.summary.histogram("b1", b1), tf.summary.histogram("W2", W2), tf.summary.histogram("b2", b2), tf.summary.histogram("W3", W3), tf.summary.histogram("b3", b3), tf.summary.histogram("layer_1", layer_1), tf.summary.histogram("layer_2", layer_2), tf.summary.histogram("output_layer", output_layer) ] if self.tanh_action: return tf.nn.tanh(output_layer) else: return output_layer def __build(self): self.state_input = tf.placeholder(tf.float32, shape=(None, self.state_dim), name="state_input") self.next_state_input = tf.placeholder(tf.float32, shape=(None, self.state_dim), name="next_state_input") self.action_input = tf.placeholder(tf.float32, shape=(None, self.action_dim), name="action_input") self.reward_input = tf.placeholder(tf.float32, shape=(None, ), name="reward_input") self.done_input = tf.placeholder(tf.float32, shape=(None, ), name="done_input") self.is_training = tf.placeholder(tf.bool, name="is_training") # inputs summary if self.detail_summary: self.input_summaries = [ tf.summary.histogram("state", self.state_input), tf.summary.histogram("next_state", self.next_state_input), tf.summary.histogram("action", self.action_input), tf.summary.histogram("reward", self.reward_input), tf.summary.histogram("done", self.done_input) ] self.target_action = self.__build_actor(self.TARGET_ACTOR_NAME, self.next_state_input) self.q_value, weight_decay = self.__build_critic( self.CRITIC_NAME, self.state_input, self.action_input) 
        self.target_q_value, _ = self.__build_critic(self.TARGET_CRITIC_NAME,
                                                     self.next_state_input,
                                                     self.target_action)

        self.targets = tf.expand_dims(self.reward_input, 1) + self.discount * (
            1 - tf.expand_dims(self.done_input, 1)) * self.target_q_value
        self.diff = self.targets - self.q_value
        self.loss = tf.reduce_mean(
            tf.square(tf.stop_gradient(self.targets) - self.q_value)) + weight_decay

        self.loss_summary = tf.summary.scalar("critic_loss", self.loss)

        self.critic_train_op = tf.train.AdamOptimizer(
            self.critic_learning_rate).minimize(self.loss)

        # add critic batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.critic_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.CRITIC_NAME)
            self.critic_bn_update_op = tf.group(*self.critic_bn_update_op)
            self.critic_train_op = tf.group(self.critic_train_op,
                                            self.critic_bn_update_op)

        self.action = self.__build_actor(self.ACTOR_NAME, self.state_input)
        self.actor_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                              scope=self.ACTOR_NAME)

        # deterministic policy gradient, assembled by hand:
        # dJ/dtheta = dQ/da * dmu/dtheta (negated because optimizers minimize)
        self.action_gradients = tf.gradients(self.q_value, self.action_input)[0]
        self.actor_params_gradient = tf.gradients(self.action, self.actor_params,
                                                  -self.action_gradients)

        # actor gradients summary
        if self.detail_summary:
            self.actor_summaries.append(
                tf.summary.histogram("action_gradient", self.action_gradients))
            for grad in self.actor_params_gradient:
                self.actor_summaries.append(
                    tf.summary.histogram("actor_parameter_gradients", grad))

        self.actor_train_op = tf.train.AdamOptimizer(
            self.actor_learning_rate).apply_gradients(
                zip(self.actor_params_gradient, self.actor_params))

        # add actor batch norm. update
        if self.input_batch_norm or self.all_batch_norm:
            self.actor_bn_update_op = tf.get_collection(
                tf.GraphKeys.UPDATE_OPS, scope=self.ACTOR_NAME)
            self.actor_bn_update_op = tf.group(*self.actor_bn_update_op)
            self.actor_train_op = tf.group(self.actor_train_op,
                                           self.actor_bn_update_op)

        self.target_critic_update = architect.create_target_update_ops(
            self.CRITIC_NAME, self.TARGET_CRITIC_NAME,
            self.critic_target_update_rate)
        self.target_actor_update = architect.create_target_update_ops(
            self.ACTOR_NAME, self.TARGET_ACTOR_NAME,
            self.actor_target_update_rate)

        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.inc_global_step = tf.assign(self.global_step,
                                         tf.add(self.global_step, 1))

        # group summaries
        self.critic_summaries = tf.summary.merge(self.critic_summaries)
        if self.detail_summary:
            self.actor_summaries = tf.summary.merge(self.actor_summaries)
            self.input_summaries = tf.summary.merge(self.input_summaries)

    @staticmethod
    def __get_weights(shape, input_shape, name="var"):
        # DDPG-paper initialization: uniform in +/- 1/sqrt(fan_in)
        return tf.Variable(tf.random_uniform(shape,
                                             -1 / math.sqrt(input_shape),
                                             1 / math.sqrt(input_shape)),
                           name=name)

    def __train_actor(self, states):
        actions = self.session.run(self.action,
                                   feed_dict={
                                       self.state_input: states,
                                       self.is_training: True
                                   })
        self.session.run(self.actor_train_op,
                         feed_dict={
                             self.state_input: states,
                             self.action_input: actions,
                             self.is_training: True
                         })

    def __train_critic(self, states, actions, rewards, next_states, done):
        feed_dict = {
            self.state_input: states,
            self.action_input: actions,
            self.reward_input: rewards,
            self.next_state_input: next_states,
            self.done_input: done,
            self.is_training: True
        }

        step = self.session.run(self.global_step)
        if step % self.log_frequency == 0:
            ops = [self.critic_train_op, self.loss_summary]
            if self.detail_summary:
                ops.append(self.actor_summaries)
                ops.append(self.input_summaries)

            res = self.session.run(ops, feed_dict=feed_dict)
            self.summary_writer.add_summary(res[1], global_step=step)
            if self.detail_summary:
                self.summary_writer.add_summary(res[2], global_step=step)
                self.summary_writer.add_summary(res[3], global_step=step)
        else:
            self.session.run(self.critic_train_op, feed_dict=feed_dict)
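# The actor update above relies on the deterministic policy gradient:
# grad_theta J = E[ dQ(s,a)/da |_{a=mu(s)} * dmu(s)/dtheta ], which the
# tf.gradients(self.action, self.actor_params, -self.action_gradients) call
# assembles by hand. A minimal sketch of the same update in an autograd
# framework (PyTorch here; `actor`, `critic`, and `opt` are hypothetical
# stand-ins, not objects from this file):
import torch

def actor_step(actor, critic, opt, states):
    # minimizing -Q(s, mu(s)) applies the chain rule automatically,
    # reproducing the hand-built gradient product above
    loss = -critic(states, actor(states)).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()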
class DDPG_Agent(Agent):
    """Interacts with and learns from the environment."""

    policy_type = "DDPG"

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        # seed the RNG and keep the seed itself (random.seed() returns None,
        # so assigning its result would store nothing useful)
        random.seed(random_seed)
        self.seed = random_seed

        # Actor Network (w/ Target Network)
        self.actor_local = DDPG_Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = DDPG_Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = DDPG_Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = DDPG_Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Statistics
        self.stats = {
            "actor_loss": [],
            "critic_loss": [],
            "reward_sum": [],
        }

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        action = self.actor_local.select_action(state)
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # --------------------------- for the plot ----------------------------- #
        with torch.no_grad():
            actions_pred_target = self.actor_target(states)
            actor_loss_target = -self.critic_target(states, actions_pred_target).mean()
            Q_expected_target = self.critic_target(states, actions)
            critic_loss_target = F.mse_loss(Q_expected_target, Q_targets)

        with open("saveDDPG_critic-actor_loss.csv", "a") as f:
            f.write("{},{}\n".format(critic_loss_target.item(),
                                     actor_loss_target.item()))

        self.save_stats(actor_loss=actor_loss.item(),
                        critic_loss=critic_loss.item(),
                        reward_sum=rewards.sum().item())

    def store_policy(self, env_name, score):
        scripted = torch.jit.script(self.actor_target)
        torch.jit.save(
            scripted, "data/policies/" + "DDPGAgent" + str(env_name) + "#" +
            str(score) + ".zip")

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
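# A minimal usage sketch for DDPG_Agent, assuming a Gym-style environment;
# the environment id and episode count below are placeholders, not from the
# source.
import gym

def run_ddpg(env_id="Pendulum-v1", episodes=100):
    env = gym.make(env_id)
    agent = DDPG_Agent(env.observation_space.shape[0],
                       env.action_space.shape[0],
                       random_seed=0)
    for _ in range(episodes):
        state = env.reset()
        agent.reset()                  # re-initialize the OU noise each episode
        done = False
        while not done:
            action = agent.act(state)  # noisy action for exploration
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state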
class DDPG(algorithms):
    def __init__(self, args):
        super().__init__(args)
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]

        self.actor = DeterministicPolicy(state_dim, action_dim, 64,
                                         self.env.action_space).to(device)
        self.actor_target = DeterministicPolicy(
            state_dim, action_dim, 64, self.env.action_space).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        self.critic = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target = QNetwork(state_dim, action_dim, 64).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        if self.args.last_episode > 0:
            self.load(self.args.last_episode)

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            x, y, u, r, d = self.replay_buffer.sample(self.args.batch_size)
            state = torch.FloatTensor(x).to(device)
            action = torch.FloatTensor(u).to(device)
            next_state = torch.FloatTensor(y).to(device)
            done = torch.FloatTensor(d).to(device)
            reward = torch.FloatTensor(r).to(device)

            # compute the target Q value
            next_action, _, _ = self.actor_target.sample(next_state)
            target_Q = self.critic_target(next_state, next_action)
            target_Q = reward + (
                (1 - done) * self.args.gamma * target_Q).detach()

            # get current Q estimate
            current_Q = self.critic(state, action)

            # compute critic loss and update
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute actor loss and update
            actor_action, _, _ = self.actor.sample(state)
            actor_loss = -self.critic(state, actor_action).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            state = self.env.reset()
            ep_r = 0
            for t in count():
                action, _, _ = self.actor.sample(
                    torch.FloatTensor([state]).to(device))
                action = action.cpu().detach().numpy()[0]
                next_state, reward, done, info = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                # np.float was removed from NumPy; the builtin float suffices
                self.replay_buffer.push(
                    (state, next_state, action, reward, float(done)))
                state = next_state

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print(
                            "Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                            .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity - 1:
                self.update()

            self.save(i + 1)

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            state = self.env.reset()
            while not done:
                with torch.no_grad():
                    # use the mean action
                    _, _, action = self.actor.sample(
                        torch.FloatTensor([state]).to(device))
                    action = action.cpu().detach().numpy()[0]
                if render:
                    self.env.render()
                state, reward, done, _ = self.env.step(action)
                total_rews += reward
                time_step += 1

            if render:
                print("total reward of this episode is " + str(total_rews))
            rewards.append(total_rews)

        rewards = np.array(rewards)
        if not render:
            pickle.dump((self.global_steps, rewards), self.log_file)
        print("mean reward {}, max reward {}".format(rewards.mean(),
                                                     rewards.max()))

    def load(self, episode=None):
        file_name = self.weights_file(episode)
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        # restore the *target* critic from its own weights (the original
        # mistakenly loaded them into self.critic a second time)
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully loaded model from " + file_name)

    def save(self, episode=None):
        file_name = self.weights_file(episode)
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("saved model to " + file_name)
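# `weights_file` is used by load()/save() above but not defined in this
# excerpt. A plausible sketch, assuming checkpoints live under a `weights/`
# directory keyed by class name and episode (the naming scheme is an
# assumption, not from the source):
import os

def weights_file(self, episode=None):
    os.makedirs("weights", exist_ok=True)
    suffix = "" if episode is None else "_ep{}".format(episode)
    return os.path.join("weights", self.__class__.__name__ + suffix + ".pt")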
class DDPG():
    def __init__(self, args, env=None):
        self.args = args

        # actor
        self.actor = DeterministicPolicy(128).to(device)
        self.actor_target = DeterministicPolicy(128).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), self.args.lr)

        # critics
        self.critic = QNetwork(128).to(device)
        self.critic_target = QNetwork(128).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), self.args.lr)

        self.replay_buffer = ReplayBuffer(self.args.capacity)
        self.num_critic_update_iteration = 0
        self.num_actor_update_iteration = 0
        self.num_training = 0
        self.global_steps = 0

        self.action_scale = torch.FloatTensor([[20, 1]]).to(device)
        self.env = env
        # self.load()

    def update(self):
        for it in range(self.args.update_iteration):
            # sample from replay buffer
            obs, local_goal, next_obs, next_goal, action, reward, done = \
                self.replay_buffer.sample(self.args.batch_size)
            obs = torch.FloatTensor(obs).to(device)
            local_goal = torch.FloatTensor(local_goal).to(device)
            next_obs = torch.FloatTensor(next_obs).to(device)
            next_goal = torch.FloatTensor(next_goal).to(device)
            action = torch.FloatTensor(action).to(device)
            reward = torch.FloatTensor(reward).to(device)
            done = torch.FloatTensor(done).to(device)

            # compute the target Q value
            next_action, _ = self.actor_target.sample(next_obs, next_goal)
            target_Q = self.critic_target(next_obs, next_goal,
                                          next_action / self.action_scale)
            target_Q = reward + ((1 - done) * self.args.gamma * target_Q).detach()

            # get current Q estimate
            current_Q = self.critic(obs, local_goal, action)

            # compute critic loss and update
            critic_loss = F.mse_loss(current_Q, target_Q)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # compute actor loss and update
            actor_action, _ = self.actor.sample(obs, local_goal)
            actor_loss = -self.critic(obs, local_goal,
                                      actor_action / self.action_scale).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # soft-update the target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.args.tau * param.data +
                                        (1 - self.args.tau) * target_param.data)

            self.num_actor_update_iteration += 1
            self.num_critic_update_iteration += 1

    def train(self):
        for i in range(self.args.max_episode):
            obs, local_goal = self.env.reset()
            ep_r = 0
            for t in count():
                action, _ = self.actor.sample(
                    torch.FloatTensor(obs).to(device),
                    torch.FloatTensor(local_goal).to(device))
                action = action.cpu().detach().numpy()[0]
                next_obs, next_goal, done, reward = self.env.step(action)
                self.global_steps += 1
                ep_r += reward
                # observations, goals, and actions are normalized before they
                # enter the buffer; np.float was removed from NumPy, so the
                # builtin float is used for the done flag
                self.replay_buffer.push(
                    (obs / 4.0, local_goal / 20., next_obs / 4.0,
                     next_goal / 20., action / np.array([20, 1]), reward,
                     float(done)))
                obs = next_obs
                local_goal = next_goal

                if done or t > self.args.max_length_trajectory:
                    if i % self.args.print_log == 0:
                        print("Ep_i \t {}, the ep_r is \t{:0.2f}, the step is \t{}, global_steps is {}"
                              .format(i, ep_r, t, self.global_steps))
                        self.evaluate(10, False)
                    break

            if len(self.replay_buffer.storage) >= self.args.capacity * 0.2:
                self.update()

            self.save()

    def evaluate(self, number=1, render=True):
        rewards = []
        for _ in range(number):
            total_rews = 0
            time_step = 0
            done = False
            obs, local_goal = self.env.reset()
            while not done:
                action = self.predict(obs / 4., local_goal / 20.)
                obs, local_goal, done, reward = self.env.step(action)
                if render:
                    self.env.render()
                total_rews += reward
                time_step += 1
                if time_step > self.args.max_length_trajectory:
                    break
                if done:
                    break
            rewards.append(total_rews)

        rewards = np.array(rewards)
        print("mean reward {}, max reward {}, min reward {}".format(
            rewards.mean(), rewards.max(), rewards.min()))

    def predict(self, obs, local_goal):
        with torch.no_grad():
            action = self.actor.forward(
                torch.FloatTensor(obs).to(device),
                torch.FloatTensor(local_goal).to(device))
            action = action.cpu().detach().numpy()[0]
        return action

    def load(self, episode=None):
        file_name = "weights/DDPG.pt"
        checkpoint = torch.load(file_name)
        self.actor.load_state_dict(checkpoint['actor'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic.load_state_dict(checkpoint['critic'])
        # restore the *target* critic from its own weights (the original
        # mistakenly loaded them into self.critic a second time)
        self.critic_target.load_state_dict(checkpoint['critic_target'])
        print("successfully loaded model from " + file_name)

    def save(self, episode=None):
        file_name = "weights/DDPG.pt"
        torch.save(
            {
                'actor': self.actor.state_dict(),
                'critic': self.critic.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, file_name)
        print("saved model to " + file_name)
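# Both DDPG variants above assume a ReplayBuffer exposing `push`, `sample`,
# and a `storage` list. A minimal sketch consistent with that usage (the real
# implementation may differ):
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.storage = []
        self.position = 0

    def push(self, transition):
        # circular (FIFO) insertion once the buffer is full
        if len(self.storage) < self.capacity:
            self.storage.append(transition)
        else:
            self.storage[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # transpose a list of transition tuples into per-column arrays, so
        # callers can unpack e.g. (obs, goal, ..., reward, done) directly
        batch = random.sample(self.storage, batch_size)
        return [np.array(col) for col in zip(*batch)]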
# initialize variables (and target network) sess.run(init) Ws,bs = Q.get_weights() Q_target.assign(sess, Ws,bs) ann_fric = (1-EPSILON)/ANNEALING EXP_PROB = 1 # initialize environment env = gym.make(ENVIRONMENT) # initialize mdp state structure mdp = MDP_state(STATE_SIZE, FRAMES) # initialize replay buffer R = ReplayBuffer(MDP_STATE_SIZE, 1, BUFFER_SIZE) buf = R.LoadBuffer(OUT_DIR+BUFFER_FILE) if buf: EXP_PROB = EPSILON populated = R.GetOccupency() print("Replay buffer loaded from disk, occupied: " + str(populated)) else: print("Creating new replay buffer") # load saved model ckpt = tf.train.get_checkpoint_state(OUT_DIR) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess,ckpt.model_checkpoint_path) print("Model loaded from disk") # define action discretization
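# A hedged sketch of the action discretization the comment above announces,
# assuming a 1-D continuous action space split into evenly spaced values;
# NUM_ACTIONS is a placeholder constant, not taken from the original script.
import numpy as np

NUM_ACTIONS = 9
discrete_actions = np.linspace(env.action_space.low[0],
                               env.action_space.high[0],
                               NUM_ACTIONS)
# at act time, the greedy discrete index maps back to a continuous action:
# a = discrete_actions[np.argmax(q_values)]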
class BaseAgent: def __init__(self, features: int, actions: int, params: Dict, seed: int, collector: Collector): self.features = features self.actions = actions self.params = params self.collector = collector self.seed = seed # define parameter contract self.gamma = params['gamma'] self.epsilon = params.get('epsilon', 0) # the mellowmax parameter self.omega = params.get('omega', 1.0) # set up network for estimating Q(s, a) self.value_net = Network(features, actions, params, seed).to(device) # build the optimizer self.optimizer_params = params['optimizer'] self.optimizer = deserializeOptimizer(self.value_net.parameters(), self.optimizer_params) self.steps = 0 # set up the replay buffer self.buffer_size = params['buffer_size'] self.batch_size = params['batch'] self.buffer_type = params.get('buffer', 'standard') if self.buffer_type == 'per': prioritization = params['prioritization'] self.buffer = PrioritizedReplayMemory(self.buffer_size, prioritization) else: self.buffer = ReplayBuffer(self.buffer_size) # build a target network self.target_refresh = params.get('target_refresh', 1) self.target_net = copy.deepcopy(self.value_net) self.initializeTargetNet() def getValues(x: torch.Tensor): qs = self.values(x).detach().cpu().squeeze(0).numpy() return qs self.policy = createEpsilonGreedy(seed, self.epsilon, getValues) # return the Q(s, a) values from the value network def values(self, x): return self.value_net(x)[0] # sample an action according to our policy def selectAction(self, x): return self.policy.selectAction(x) def initializeTargetNet(self): # if we aren't using target nets, then save some compute if self.target_refresh > 1: self.target_net = copy.deepcopy(self.value_net) cloneNetworkWeights(self.value_net, self.target_net) else: self.target_net = self.value_net @abstractmethod def updateNetwork(self, batch: Batch, predictions: Dict): pass @abstractmethod def forward(self, batch: Batch) -> Dict[str, torch.Tensor]: pass @abstractmethod def bootstrap(self, batch: Batch, next_values: torch.Tensor) -> Dict[str, torch.Tensor]: pass # a helper method that lets us bypass combining gradients whenever # target networks are disabled def combineTargetGrads(self): if self.target_net == self.value_net: return addGradients_(self.value_net, self.target_net) def update(self, s, a, sp, r, gamma): self.buffer.add((s, a, sp, r, gamma)) self.steps += 1 if self.steps % self.target_refresh == 0 and self.target_refresh > 1: cloneNetworkWeights(self.value_net, self.target_net) if len(self.buffer) > self.batch_size + 1: samples, idcs = self.buffer.sample(self.batch_size) batch = getBatchColumns(samples) predictions = self.forward(batch) tde = self.updateNetwork(batch, predictions) self.buffer.update_priorities(idcs, tde)
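# A sketch of how a concrete agent could fill in BaseAgent's abstract
# methods, DQN-style. The Batch column names used here (`states`, `actions`,
# `next_states`, `rewards`, `gamma`) are assumptions about what
# getBatchColumns produces; the real names may differ.
import torch

class SketchDQN(BaseAgent):
    def forward(self, batch):
        # Q(s, a) for the actions actually taken
        q = self.value_net(batch.states)[0]
        return {'value': q.gather(1, batch.actions)}

    def bootstrap(self, batch, next_values):
        # one-step target: r + gamma * max_a' Q_target(s', a')
        target = batch.rewards + batch.gamma * \
            next_values.max(dim=1, keepdim=True).values
        return {'target': target.detach()}

    def updateNetwork(self, batch, predictions):
        with torch.no_grad():
            next_values = self.target_net(batch.next_states)[0]
        target = self.bootstrap(batch, next_values)['target']
        tde = target - predictions['value']
        loss = 0.5 * tde.pow(2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # per-sample TD errors drive the prioritized-replay update in update()
        return tde.abs().detach().cpu().numpy()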