Example #1
    def __init__(self, agent, policy_model):
        """ Constructor for a maximum-likelihood estimation (MLE) learning setup """

        self.agent = agent
        self.policy_model = policy_model

        # Replay memory
        max_replay_memory_size = 2000
        self.replay_memory = collections.deque(maxlen=max_replay_memory_size)
        rho = 0.5
        self.ps = prioritized_sweeping.PrioritizedSweeping(0, rho)

        # Optimizer and MLE loss; self.mle_learning_rate is expected to be defined elsewhere on the class
        optimizer = tf.train.AdamOptimizer(self.mle_learning_rate)
        loss = MaximumLikelihoodEstimation.calc_loss(
            self.policy_model.model_output, self.policy_model.model_output_indices)

        # Optionally clip each gradient to a maximum L2 norm before applying the update
        using_grad_clip = True
        grad_clip_val = 5.0
        if not using_grad_clip:
            train_step = optimizer.minimize(loss)
        else:
            gvs = optimizer.compute_gradients(loss)
            capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var)
                          if grad is not None else (grad, var) for grad, var in gvs]
            train_step = optimizer.apply_gradients(capped_gvs)

        # Create summaries for training (tf.scalar_summary is the pre-1.0 API name;
        # it became tf.summary.scalar in TensorFlow 1.x)
        summary_loss = tf.scalar_summary("Loss", loss)
        update_summaries = [summary_loss]

        AbstractLearning.__init__(self, policy_model, loss, train_step, update_summaries)
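
Both examples use the same TensorFlow 1.x-era pattern: compute_gradients, clip each gradient with tf.clip_by_norm, then apply_gradients. The sketch below shows the same per-gradient norm clipping against the TensorFlow 2.x eager API (an assumption; the examples here target the older graph API), with a throwaway model and batch purely to exercise the pattern.

import tensorflow as tf

# Hypothetical tiny model, loss, and batch; only the clipping pattern matters here.
model = tf.keras.Sequential([tf.keras.layers.Dense(4, activation="relu"),
                             tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss_fn = tf.keras.losses.MeanSquaredError()
grad_clip_val = 5.0  # same clip value as in the examples above

x = tf.random.normal([32, 8])
y = tf.random.normal([32, 1])

with tf.GradientTape() as tape:
    loss = loss_fn(y, model(x))

grads = tape.gradient(loss, model.trainable_variables)
# Clip each gradient to a maximum L2 norm, skipping variables without a gradient,
# mirroring the `if grad is not None` guard above.
capped = [tf.clip_by_norm(g, grad_clip_val) if g is not None else g for g in grads]
optimizer.apply_gradients(zip(capped, model.trainable_variables))
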
Example #2
    def __init__(self, agent, q_network, target_q_network):
        """ Creates constructor for an abstract learning setup """

        self.agent = agent
        self.loss = None
        self.q_network = q_network
        self.target_q_network = target_q_network

        # Define epsilon greedy behaviour policy
        epsilon = 1.0
        min_epsilon = 0.1
        self.behaviour_policy = egp.EpsilonGreedyPolicy(epsilon, min_epsilon)

        # Replay memory and prioritized sweeping for sampling from the replay memory
        max_replay_memory_size = 2000
        self.replay_memory = collections.deque(maxlen=max_replay_memory_size)
        rho = 0.5
        self.ps = prioritized_sweeping.PrioritizedSweeping(0, rho)

        optimizer = tf.train.AdamOptimizer(self.rl_learning_rate)
        loss = self.calc_loss(self.q_network.model_output,
                              self.q_network.model_output_indices,
                              self.q_network.target)

        using_grad_clip = True
        grad_clip_val = 5.0
        if not using_grad_clip:
            train_step = optimizer.minimize(loss)
        else:
            gvs = optimizer.compute_gradients(loss)
            capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var)
                          if grad is not None else (grad, var) for grad, var in gvs]
            train_step = optimizer.apply_gradients(capped_gvs)

        # Create summaries for training
        summary_loss = tf.scalar_summary("Loss", loss)
        update_summaries = [summary_loss]

        AbstractLearning.__init__(self, q_network, loss, train_step,
                                  update_summaries)
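
The behaviour policy egp.EpsilonGreedyPolicy(epsilon, min_epsilon) is not shown in these examples. A hypothetical stand-in with a linear anneal from epsilon down to min_epsilon might look like the following; the select_action signature and the decay schedule are assumptions, not the library's actual API.

import random


class EpsilonGreedyPolicy:
    """Hypothetical epsilon-greedy behaviour policy with linear decay."""

    def __init__(self, epsilon, min_epsilon, decay_steps=10000):
        self.epsilon = epsilon
        self.min_epsilon = min_epsilon
        # Amount subtracted from epsilon after each action selection.
        self.decay = (epsilon - min_epsilon) / float(decay_steps)

    def select_action(self, q_values):
        """Return a random action with probability epsilon, otherwise the greedy one."""
        if random.random() < self.epsilon:
            action = random.randrange(len(q_values))
        else:
            action = max(range(len(q_values)), key=lambda a: q_values[a])
        # Anneal epsilon towards its floor.
        self.epsilon = max(self.min_epsilon, self.epsilon - self.decay)
        return action


# Usage, mirroring behaviour_policy = egp.EpsilonGreedyPolicy(1.0, 0.1) above.
policy = EpsilonGreedyPolicy(epsilon=1.0, min_epsilon=0.1)
print(policy.select_action([0.2, 0.7, 0.1]))
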