def __init__(self, agent, policy_model, state_value_model, total_reward):
    self.agent = agent
    self.policy_model = policy_model
    self.state_value_model = state_value_model
    self.total_reward = total_reward

    # Compute MLE loss function. MLE is used to initialize parameters for REINFORCE.
    self.mle_policy_gradient = MaximumLikelihoodEstimation(agent, policy_model)

    # Compute REINFORCE loss function
    loss_reinforce, entropy_penalty = self.calc_loss(
        policy_model.model_output, policy_model.model_output_indices, policy_model.target)

    optimizer = tf.train.AdamOptimizer(self.rl_learning_rate)

    using_grad_clip = True
    grad_clip_val = 5.0
    if not using_grad_clip:
        train_step = optimizer.minimize(loss_reinforce)
    else:
        gvs = optimizer.compute_gradients(loss_reinforce)
        capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var) if grad is not None else (grad, var)
                      for grad, var in gvs]
        train_step = optimizer.apply_gradients(capped_gvs)

    # Create summaries for training
    summary_loss = tf.scalar_summary("Loss", loss_reinforce)
    summary_target_min = tf.scalar_summary("Target Min", tf.reduce_min(self.policy_model.target))
    summary_target_max = tf.scalar_summary("Target Max", tf.reduce_max(self.policy_model.target))
    summary_target_mean = tf.scalar_summary("Target Mean", tf.reduce_mean(self.policy_model.target))
    summary_entropy_penalty = tf.scalar_summary("Entropy Penalty", entropy_penalty)
    update_summaries = [summary_loss, summary_target_min, summary_target_max,
                        summary_target_mean, summary_entropy_penalty]

    AbstractLearning.__init__(self, policy_model, loss_reinforce, train_step, update_summaries)
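
# The constructor above calls self.calc_loss but that method is not shown. The sketch
# below is one plausible shape for a REINFORCE loss with an entropy penalty, assuming
# model_output holds per-step action logits, model_output_indices holds (row, action)
# index pairs for the actions taken, and target holds baseline-adjusted returns.
# These shapes are assumptions, not confirmed by the snippet; written against the
# TF 1.x graph API for illustration only.
import tensorflow as tf

def reinforce_loss_sketch(logits, action_indices, returns, entropy_coef=0.1):
    """logits: [batch, num_actions]; action_indices: [batch, 2]; returns: [batch].
    Returns (loss, entropy_penalty)."""
    log_probs = tf.nn.log_softmax(logits)
    probs = tf.nn.softmax(logits)
    # Log-probability of the actions that were actually taken
    chosen_log_probs = tf.gather_nd(log_probs, action_indices)
    # Policy-gradient surrogate: maximize return-weighted log-likelihood
    pg_loss = -tf.reduce_mean(chosen_log_probs * returns)
    # Entropy bonus discourages premature collapse to a deterministic policy
    entropy = -tf.reduce_mean(tf.reduce_sum(probs * log_probs, axis=1))
    entropy_penalty = -entropy_coef * entropy
    return pg_loss + entropy_penalty, entropy_penalty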
def __init__(self, agent, policy_model):
    self.agent = agent
    self.policy_model = policy_model

    # Replay memory
    max_replay_memory_size = 2000
    self.replay_memory = collections.deque(maxlen=max_replay_memory_size)
    rho = 0.5
    self.ps = prioritized_sweeping.PrioritizedSweeping(0, rho)

    optimizer = tf.train.AdamOptimizer(self.mle_learning_rate)
    loss = MaximumLikelihoodEstimation.calc_loss(
        self.policy_model.model_output, self.policy_model.model_output_indices)

    using_grad_clip = True
    grad_clip_val = 5.0
    if not using_grad_clip:
        train_step = optimizer.minimize(loss)
    else:
        gvs = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var) if grad is not None else (grad, var)
                      for grad, var in gvs]
        train_step = optimizer.apply_gradients(capped_gvs)

    # Create summaries for training
    summary_loss = tf.scalar_summary("Loss", loss)
    update_summaries = [summary_loss]

    AbstractLearning.__init__(self, policy_model, loss, train_step, update_summaries)
def __init__(self, model, action_space, meta_data_util, config, constants, tensorboard):
    self.max_epoch = 100  # constants["max_epochs"]
    self.model = model
    self.action_space = action_space
    self.meta_data_util = meta_data_util
    self.config = config
    self.constants = constants
    self.tensorboard = tensorboard
    self.entropy = None
    self.cross_entropy = None
    self.entropy_coef = constants["entropy_coefficient"]
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])

    ###################
    # 63 x 63 tables of confusion counts (numerator and denominator)
    self.confusion_num_count = []
    self.confusion_denom_count = []
    for i in range(0, 63):
        self.confusion_num_count.append([0.0] * 63)
        self.confusion_denom_count.append([0.0] * 63)
    ###################

    AbstractLearning.__init__(self, self.model, self.calc_loss,
                              self.optimizer, self.config, self.constants)
def __init__(self, agent):
    self.agent = agent
    self.replay_memory = None
    self.batch_size = None
    self.null_previous_action = None
    self.ps = None
    AbstractLearning.__init__(self, agent)
def __init__(self, model, config, constants, tensorboard):
    self.max_epoch = constants["max_epochs"]
    self.model = model
    self.config = config
    self.constants = constants
    self.tensorboard = tensorboard
    self.entropy_coef = constants["entropy_coefficient"]
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])
    AbstractLearning.__init__(self, self.model, self.calc_loss, self.optimizer,
                              self.config, self.constants, self.tensorboard)
def __init__(self, model, action_space, meta_data_util, config, constants):
    self.max_epoch = constants["max_epochs"]
    self.model = model
    self.action_space = action_space
    self.meta_data_util = meta_data_util
    self.config = config
    self.constants = constants
    self.tensorboard = Tensorboard()
    self.entropy_coef = constants["entropy_coefficient"]
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])
    AbstractLearning.__init__(self, self.model, self.calc_loss,
                              self.optimizer, self.config, self.constants)
def __init__(self, model, action_space, meta_data_util, config, constants, tensorboard):
    self.max_epoch = constants["max_epochs"]
    self.model = model
    self.action_space = action_space
    self.meta_data_util = meta_data_util
    self.config = config
    self.constants = constants
    self.tensorboard = tensorboard
    self.global_replay_memory = collections.deque(maxlen=2000)
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])
    AbstractLearning.__init__(self, self.model, self.calc_loss, self.optimizer,
                              self.config, self.constants, self.tensorboard)
def __init__(self, model, action_space, meta_data_util, config, constants, tensorboard):
    self.max_epoch = 100  # constants["max_epochs"]
    self.model = model
    self.action_space = action_space
    self.meta_data_util = meta_data_util
    self.config = config
    self.constants = constants
    self.tensorboard = tensorboard
    self.entropy_coef = constants["entropy_coefficient"]
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])
    self.linguistic_prior = LinguisticPrior()
    # self.alignment_reward = AlignmentReward()
    self.entropy = None
    AbstractLearning.__init__(self, self.model, self.calc_loss,
                              self.optimizer, self.config, self.constants)
def __init__(self, agent, q_network, target_q_network):
    """ Q-learning setup with a target network, an epsilon-greedy behaviour policy
    and a prioritized replay memory. """
    self.agent = agent
    self.loss = None
    self.q_network = q_network
    self.target_q_network = target_q_network

    # Define epsilon-greedy behaviour policy
    epsilon = 1.0
    min_epsilon = 0.1
    self.behaviour_policy = egp.EpsilonGreedyPolicy(epsilon, min_epsilon)

    # Replay memory and prioritized sweeping for sampling from the replay memory
    max_replay_memory_size = 2000
    self.replay_memory = collections.deque(maxlen=max_replay_memory_size)
    rho = 0.5
    self.ps = prioritized_sweeping.PrioritizedSweeping(0, rho)

    optimizer = tf.train.AdamOptimizer(self.rl_learning_rate)
    loss = self.calc_loss(self.q_network.model_output,
                          self.q_network.model_output_indices,
                          self.q_network.target)

    using_grad_clip = True
    grad_clip_val = 5.0
    if not using_grad_clip:
        train_step = optimizer.minimize(loss)
    else:
        gvs = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var) if grad is not None else (grad, var)
                      for grad, var in gvs]
        train_step = optimizer.apply_gradients(capped_gvs)

    # Create summaries for training
    summary_loss = tf.scalar_summary("Loss", loss)
    update_summaries = [summary_loss]

    AbstractLearning.__init__(self, q_network, loss, train_step, update_summaries)
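
# The constructor above builds egp.EpsilonGreedyPolicy(epsilon, min_epsilon), whose
# implementation is not shown. The self-contained sketch below illustrates the kind
# of behaviour policy those arguments suggest (start fully exploratory at epsilon=1.0,
# anneal towards min_epsilon=0.1). The interface (get_action / decay_epsilon) and the
# decay schedule are hypothetical, not the project's actual class.
import random

class EpsilonGreedyPolicySketch(object):
    def __init__(self, epsilon, min_epsilon, decay=0.995):
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        self.decay = decay

    def get_action(self, q_values):
        """q_values: a list of Q-values, one per action."""
        if random.random() < self.epsilon:
            return random.randrange(len(q_values))                        # explore
        return max(range(len(q_values)), key=lambda a: q_values[a])       # exploit

    def decay_epsilon(self):
        """Anneal epsilon multiplicatively, never below min_epsilon."""
        self.epsilon = max(self.min_epsilon, self.epsilon * self.decay)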
def __init__(self, model, action_space, meta_data_util, config, constants, tensorboard):
    self.max_epoch = constants["max_epochs"]
    self.model = model
    self.action_space = action_space
    self.meta_data_util = meta_data_util
    self.config = config
    self.constants = constants
    self.tensorboard = tensorboard
    self.global_replay_memory = collections.deque(maxlen=2000)
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])

    # 48 angular buckets of 7.5 degrees each cover the full 360-degree circle
    theta_values = []
    for i in range(0, 48):
        theta_values.append([i * 7.5])
    self.theta_values = cuda_var(torch.from_numpy(np.array(theta_values))).float()

    AbstractLearning.__init__(self, self.model, self.calc_loss, self.optimizer,
                              self.config, self.constants, self.tensorboard)
def __init__(self, model, action_space, meta_data_util, config, constants, tensorboard,
             resnet_detection_model):
    self.max_epoch = constants["max_epochs"]
    self.model = model
    self.resnet_detection_model = resnet_detection_model
    self.action_space = action_space
    self.meta_data_util = meta_data_util
    self.config = config
    self.constants = constants
    self.tensorboard = tensorboard
    self.discretize = nav_drone_symbolic_instructions.BUCKET_WIDTH
    self.num_buckets = nav_drone_symbolic_instructions.NO_BUCKETS
    self.global_replay_memory = collections.deque(maxlen=2000)
    self.optimizer = optim.Adam(model.get_parameters(),
                                lr=constants["learning_rate"])

    # One representative angle per bucket, spaced by the bucket width
    theta_values = []
    for i in range(0, self.num_buckets):
        theta_values.append([i * self.discretize])
    self.theta_values = cuda_var(torch.from_numpy(np.array(theta_values))).float()

    AbstractLearning.__init__(self, self.model, self.calc_loss, self.optimizer,
                              self.config, self.constants, self.tensorboard)
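
# A small sketch of how the bucketed angles above could be used: mapping a continuous
# angle in degrees to its bucket index, and back to the bucket's representative angle
# (i * bucket_width, matching theta_values). It assumes bucket_width * num_buckets == 360,
# consistent with the hard-coded 48 x 7.5-degree variant; the helper names are illustrative,
# not taken from the project.
def angle_to_bucket(angle_deg, bucket_width=7.5, num_buckets=48):
    """Return the index of the bucket containing angle_deg (wrapped into [0, 360))."""
    return int((angle_deg % 360.0) // bucket_width) % num_buckets

def bucket_to_angle(bucket, bucket_width=7.5):
    """Return the representative angle of a bucket, as stored in theta_values."""
    return bucket * bucket_width

assert angle_to_bucket(0.0) == 0
assert angle_to_bucket(359.9) == 47
assert bucket_to_angle(angle_to_bucket(97.0)) == 90.0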