Example #1
    def __init__(self, agent, policy_model, total_reward):
        self.agent = agent
        self.policy_model = policy_model
        self.total_reward = total_reward

        # Maximum likelihood estimation (MLE) learner; MLE is used to initialize the parameters before policy-gradient training
        self.mle_policy_gradient = MaximumLikelihoodEstimation(agent, policy_model)

        # Compute the policy-gradient loss and the entropy penalty
        loss, entropy_penalty = self.calc_loss(
            self.policy_model.model_output, self.policy_model.model_output_indices, self.policy_model.target)

        optimizer = tf.train.AdamOptimizer(AbstractLearning.rl_learning_rate)

        using_grad_clip = True
        grad_clip_val = 5.0
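        # Optionally clip each gradient's norm at grad_clip_val to guard against exploding gradients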
        if not using_grad_clip:
            train_step = optimizer.minimize(loss)
        else:
            gvs = optimizer.compute_gradients(loss)
            capped_gvs = [(tf.clip_by_norm(grad, grad_clip_val), var)
                          if grad is not None else (grad, var) for grad, var in gvs]
            train_step = optimizer.apply_gradients(capped_gvs)

        # Create summaries for training
        summary_loss = tf.scalar_summary("Loss", loss)
        summary_target_min = tf.scalar_summary("Target Min", tf.reduce_min(self.policy_model.target))
        summary_target_max = tf.scalar_summary("Target Max", tf.reduce_max(self.policy_model.target))
        summary_target_mean = tf.scalar_summary("Target Mean", tf.reduce_mean(self.policy_model.target))
        summary_entropy_penalty = tf.scalar_summary("Entropy Penalty", entropy_penalty)
        update_summaries = [summary_loss, summary_target_min,
                            summary_target_max, summary_target_mean, summary_entropy_penalty]

        AbstractLearning.__init__(self, policy_model, loss, train_step, update_summaries)
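
The constructor above delegates the actual loss computation to calc_loss, which is not part of this snippet. Below is a minimal sketch of what a REINFORCE-style calc_loss with an entropy penalty could look like; the tensor shapes, the num_actions default (20 blocks x 4 directions + stop = 81, taken from Example #2), and the entropy_coeff value are illustrative assumptions rather than the repository's actual implementation.

    def calc_loss(self, model_output, model_output_indices, target,
                  num_actions=81, entropy_coeff=0.1):
        # model_output: action logits, shape [batch, num_actions]
        # model_output_indices: indices of the sampled actions, shape [batch]
        # target: reward signal used to weight the log-likelihood, shape [batch]
        log_probs = tf.nn.log_softmax(model_output)
        probs = tf.nn.softmax(model_output)

        # Log-probability of the action that was actually taken
        action_mask = tf.one_hot(model_output_indices, num_actions)
        chosen_log_prob = tf.reduce_sum(log_probs * action_mask, reduction_indices=[1])

        # REINFORCE objective: maximize the reward-weighted log-likelihood
        pg_loss = -tf.reduce_mean(chosen_log_prob * target)

        # Entropy penalty: subtracting the policy entropy from the loss
        # discourages prematurely peaked action distributions
        entropy = -tf.reduce_mean(tf.reduce_sum(probs * log_probs, reduction_indices=[1]))
        entropy_penalty = -entropy_coeff * entropy

        return pg_loss + entropy_penalty, entropy_penalty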
Example #2
    def __init__(self, train_alg, config, constants):

        # Initialize logger
        logger.Log.open("./log_" + str(datetime.now()) + ".txt")

        self.config = config

        # Connect to simulator
        if len(sys.argv) < 2:
            logger.Log.info("IP not given. Using localhost i.e. 0.0.0.0")
            self.unity_ip = "0.0.0.0"
        else:
            self.unity_ip = sys.argv[1]

        if len(sys.argv) < 3:
            logger.Log.info("PORT not given. Using 11000")
            self.PORT = 11000
        else:
            self.PORT = int(sys.argv[2])

        # Size of image
        image_dim = self.config.screen_size
        self.connection = rc.ReliableConnect(self.unity_ip, self.PORT,
                                             image_dim)
        self.connection.connect()

        # Dataset specific parameters
        self.num_block = 20
        self.num_direction = 4
        use_stop = True
        if use_stop:
            self.num_actions = self.num_block * self.num_direction + 1  # 1 for stopping
        else:
            self.num_actions = self.num_block * self.num_direction

        # Create toolkit for the message protocol between the simulator and the agent
        self.message_protocol_kit = mpu.MessageProtocolUtil(
            self.num_direction, self.num_actions, use_stop)

        # Test-time policy: greedily pick the argmax action
        self.test_policy = gp.GenericPolicy.get_argmax_action

        # MDP details
        self.gamma = 1.0

        # Training algorithm behaviour
        self.train_alg = train_alg

        # Define model and learning algorithm
        if self.train_alg == SUPERVISEDMLE:
            self.model = PolicyNetwork(image_dim, self.num_actions, constants)
            self.learning_alg = MaximumLikelihoodEstimation(self, self.model)
        elif self.train_alg == REINFORCE:
            self.model = PolicyNetwork(image_dim, self.num_actions, constants)
            self.learning_alg = PolicyGradient(self,
                                               self.model,
                                               total_reward=True)
        elif self.train_alg == CONTEXTUALBANDIT:
            self.model = PolicyNetwork(image_dim, self.num_actions, constants)
            self.learning_alg = PolicyGradient(self,
                                               self.model,
                                               total_reward=False)
        elif self.train_alg == PGADVANTAGE:
            self.model = PolicyNetwork(image_dim, self.num_actions, constants)
            self.state_value_model = StateValueFunctionModel(
                250, image_dim, 200, 24, 32)
            self.learning_alg = PolicyGradientWithAdvantage(
                self, self.model, self.state_value_model, total_reward=True)
        elif self.train_alg == SIMPLEQLEARNING:
            self.model = ActionValueFunctionNetwork(250, image_dim, 200, 24,
                                                    32)
            self.target_q_network = ActionValueFunctionNetwork(
                250, image_dim, 200, 24, 32, scope_name="Target_Q_Network")
            self.learning_alg = QLearning(self, self.model,
                                          self.target_q_network)
        else:
            raise AssertionError("Training algorithm " + str(self.train_alg) +
                                 " not found or implemented.")

        self.sess = None
        self.train_writer = None
        self.config.log_flag()
        logger.Log.info("Training Algorithm: " + str(self.train_alg) +
                        ", Gamma: " + str(self.gamma))
        logger.Log.info("Created Agent.")