Code Example #1
File: reinforce.py Project: yotaro-shimose/TSPMDP
    def __init__(
        self,
        encoder_builder: Callable,
        decoder_builder: Callable,
        n_epochs: int = 10000,
        n_nodes: int = 20,
        n_iterations: int = 10,
        n_validations: int = 100,
        n_parallels: int = 5,
        learning_rate: float = 1e-5,
        significance: float = 0.15,
        logger: TFLogger = None,
        save_dir="./models/",
        load_dir=None,
    ):
        self.n_epochs = n_epochs
        self.n_iterations = n_iterations
        self.n_validations = n_validations
        self.n_parallels = n_parallels
        self.online_env = TSPMDP(batch_size=n_parallels, n_nodes=n_nodes)
        self.baseline_env = TSPMDP(batch_size=n_parallels, n_nodes=n_nodes)
        self.save_dir = save_dir

        self.online_encoder: tf.keras.models.Model = encoder_builder()
        self.online_decoder: tf.keras.models.Model = decoder_builder()
        self.baseline_encoder: tf.keras.models.Model = encoder_builder()
        self.baseline_decoder: tf.keras.models.Model = decoder_builder()

        if load_dir:
            self.load(load_dir)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.significance = significance
        self.logger = logger
Code Example #2
File: test_env.py Project: yotaro-shimose/TSPMDP
def raise_at_final(env: TSPMDP, actions: list):
    env.reset()
    answer = False
    for i in range(len(actions)):
        action = actions[i]
        try:
            action = tf.constant([action], dtype=tf.int32)
            env.step(action)
        except tf.errors.InvalidArgumentError:
            if i == len(actions) - 1:
                answer = True
            else:
                break
    return answer
Code Example #3
File: test_env.py Project: yotaro-shimose/TSPMDP
def complete_synario(env: TSPMDP, actions: list):
    env.reset()
    answer = False
    is_terminal = False
    for i in range(len(actions)):
        action = actions[i]
        try:
            action = tf.constant([action], dtype=tf.int32)
            _, _, is_terminal = env.step(action)
        except tf.errors.InvalidArgumentError:
            break
    else:
        if is_terminal:
            answer = True
    return answer
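
Both helpers above return a boolean rather than asserting, so a test is expected to wrap them in assert. A minimal usage sketch (the 5-node instance and the action lists are illustrative choices, not taken from the repository's tests):

env = TSPMDP(batch_size=1, n_nodes=5)
# A full tour 0 -> 1 -> 2 -> 3 -> 4 -> 0 should end with is_terminal == True.
assert complete_synario(env, [1, 2, 3, 4, 0])
# Revisiting node 2 on the final step should presumably be rejected by the
# action mask, which raise_at_final reports as True.
assert raise_at_final(env, [1, 2, 2])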
Code Example #4
File: test_env.py Project: yotaro-shimose/TSPMDP
def test_env_mask():
    env = TSPMDP(batch_size=1, n_nodes=10)
    actions = [0]
    raise_at_final(env, actions)
    actions = [3, 4, 4]
    raise_at_final(env, actions)
    actions = [1, 2, 3, 4, 0]
    raise_at_final(env, actions)
    actions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 9]
    raise_at_final(env, actions)
Code Example #5
File: test_env.py Project: yotaro-shimose/TSPMDP
def test_env_rewards():
    batch_size = 1
    n_nodes = 4
    reward_on_episode = False
    env = TSPMDP(batch_size=batch_size,
                 n_nodes=n_nodes,
                 reward_on_episode=reward_on_episode)
    env.reset()
    state_dict = env.export_states()
    new_coordinates = tf.constant([[[0., 0.], [0., 1.], [1., 1.], [1., 0.]]])
    state_dict["coordinates"] = new_coordinates
    env.import_states(state_dict)
    actions = [1, 2, 3]
    episode_reward = 0
    for action in actions:
        _, reward, _ = env.step(tf.constant([action], dtype=tf.int32))
        episode_reward += reward
    assert episode_reward == -n_nodes
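
The expected value in this assertion follows from the coordinates injected above: the tour 0 -> 1 -> 2 -> 3 walks three unit edges of the unit square, and the assertion implies the environment also credits the closing edge 3 -> 0 on the final step, for a total length of 4 = n_nodes. A quick check of that arithmetic:

import tensorflow as tf

coords = tf.constant([[0., 0.], [0., 1.], [1., 1.], [1., 0.]])
order = [0, 1, 2, 3, 0]  # visiting order, closing edge assumed to be included
tour_length = sum(float(tf.norm(coords[b] - coords[a]))
                  for a, b in zip(order, order[1:]))
assert tour_length == 4.0  # hence episode_reward == -4 == -n_nodes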
Code Example #6
File: test_env.py Project: yotaro-shimose/TSPMDP
def test_env_reward_on_episode():
    B, N = 128, 100
    original = TSPMDP(batch_size=B, n_nodes=N, reward_on_episode=True)
    original.reset()
    copy = TSPMDP(batch_size=B, n_nodes=N, reward_on_episode=False)
    init_state = original.export_states()
    init_state.pop("rewards")
    copy_sum = 0
    copy.import_states(init_state)
    actions = tf.constant([list(range(1, N)) for _ in range(B)],
                          dtype=tf.int32)
    original_reward = None
    for j in range(actions.shape[1]):
        action = actions[:, j]
        _, original_reward, _ = original.step(action)
        _, copy_reward, _ = copy.step(action)
        assert tf.reduce_all(
            original_reward == 0.) or j == actions.shape[1] - 1
        copy_sum += copy_reward
    tf.assert_equal(original_reward, copy_sum)
Code Example #7
File: test_env.py Project: yotaro-shimose/TSPMDP
def test_env_synchronization():
    original = TSPMDP(batch_size=1, n_nodes=10)
    original.reset()
    copy = TSPMDP(batch_size=1, n_nodes=10)
    copy.import_states(original.export_states())
    original_sum = 0
    copy_sum = 0
    actions = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    for i in range(len(actions)):
        action = actions[i]
        action = tf.constant([action], dtype=tf.int32)
        _, original_reward, _ = original.step(action)
        _, copy_reward, _ = copy.step(action)
        original_sum += original_reward
        copy_sum += copy_reward
    assert original_sum == copy_sum
Code Example #8
File: test_env.py Project: yotaro-shimose/TSPMDP
def test_env_complete():
    env = TSPMDP(batch_size=1, n_nodes=10)
    actions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    complete_synario(env, actions)
Code Example #9
    def __call__(self):
        return TSPMDP(batch_size=self.batch_size,
                      n_nodes=self.n_nodes,
                      reward_on_episode=self.reward_on_episode)
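
This fragment reads like the __call__ of a small environment-factory object that stores batch_size, n_nodes and reward_on_episode. A hypothetical container consistent with it (the name TSPMDPBuilder and the dataclass form are assumptions, not the repository's actual code):

from dataclasses import dataclass

@dataclass
class TSPMDPBuilder:  # hypothetical name for the enclosing factory
    batch_size: int
    n_nodes: int
    reward_on_episode: bool = False

    def __call__(self) -> TSPMDP:
        # Build a fresh environment with the stored configuration.
        return TSPMDP(batch_size=self.batch_size,
                      n_nodes=self.n_nodes,
                      reward_on_episode=self.reward_on_episode)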
Code Example #10
File: reinforce.py Project: yotaro-shimose/TSPMDP
class Reinforce:
    def __init__(
        self,
        encoder_builder: Callable,
        decoder_builder: Callable,
        n_epochs: int = 10000,
        n_nodes: int = 20,
        n_iterations: int = 10,
        n_validations: int = 100,
        n_parallels: int = 5,
        learning_rate: float = 1e-5,
        significance: float = 0.15,
        logger: TFLogger = None,
        save_dir="./models/",
        load_dir=None,
    ):
        self.n_epochs = n_epochs
        self.n_iterations = n_iterations
        self.n_validations = n_validations
        self.n_parallels = n_parallels
        self.online_env = TSPMDP(batch_size=n_parallels, n_nodes=n_nodes)
        self.baseline_env = TSPMDP(batch_size=n_parallels, n_nodes=n_nodes)
        self.save_dir = save_dir

        self.online_encoder: tf.keras.models.Model = encoder_builder()
        self.online_decoder: tf.keras.models.Model = decoder_builder()
        self.baseline_encoder: tf.keras.models.Model = encoder_builder()
        self.baseline_decoder: tf.keras.models.Model = decoder_builder()

        if load_dir:
            self.load(load_dir)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.significance = significance
        self.logger = logger

    def start(self):
        self.build()
        for epoch in range(self.n_epochs):
            for iteration in range(self.n_iterations):
                metrics = self.train_on_episode()
                step = epoch * self.n_iterations + iteration
                if self.logger:
                    self.logger.log(metrics, step)
            if self.validate():
                print(
                    f"Epoch: {epoch}, Validation passed")
                if self.save_dir:
                    self.save(self.save_dir)
                self.synchronize(self.online_encoder, self.baseline_encoder)
                self.synchronize(self.online_decoder, self.baseline_decoder)

    @tf.function
    def train_on_episode(self):
        """train_on_episode executes parallel episodes at the same time and learn from the experiences.
        """
        # ** Initialization ** #

        # Initialize state
        self.online_env.reset()
        # Copy env list for baseline.
        self.baseline_env.import_states(self.online_env.export_states())

        # Greedy rollout
        base_rewards, _ = self.play_game(
            env=self.baseline_env,
            encoder=self.baseline_encoder,
            decoder=self.baseline_decoder,
            greedy=tf.constant(True)
        )
        with tf.GradientTape() as tape:
            # Execute an episode for each online environment
            online_rewards, log_likelihood = self.play_game(
                env=self.online_env,
                encoder=self.online_encoder,
                decoder=self.online_decoder,
                greedy=tf.constant(False)
            )

            # ** Learn from experience ** #

            trainable_variables = self.online_encoder.trainable_variables + \
                self.online_decoder.trainable_variables
            excess_cost = tf.stop_gradient((base_rewards - online_rewards))
            # Get policy gradient to apply to our network
            policy_gradient = tape.gradient(tf.reduce_mean(
                excess_cost * log_likelihood), trainable_variables)

            # Apply gradient
            self.optimizer.apply_gradients(
                zip(policy_gradient, trainable_variables))

        # metrics
        metrics = {
            "cost against baseline": tf.reduce_mean(excess_cost),
            "baseline_rewards": tf.reduce_mean(base_rewards),
            "online_rewards": tf.reduce_mean(online_rewards),
        }

        return metrics

    @tf.function
    def play_game(
        self,
        env: TSPMDP,
        encoder: tf.keras.models.Model,
        decoder: tf.keras.models.Model,
        greedy: tf.Tensor = tf.constant(False)
    ):
        """play games in parallels

        Args:
            envs (tf.Module): list of environments which are RESET.
            network (tf.keras.models.Model): [description]
            greedy (bool, optional): [description]. Defaults to False.

        Returns:
            tuple(tf.Tensor(batch_size), tf.Tensor(batch_size, graph_size, graph_size)):
                rewards, policies
        """

        # ** Initialization ** #
        # Get graph
        dones: tf.Tensor = tf.zeros((self.n_parallels,), dtype=tf.int32)
        ones: tf.Tensor = tf.ones(dones.shape, dtype=tf.int32)
        rewards: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        log_likelihood: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        states: tf.Tensor = env.get_states()
        divisor: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        # shape variables
        shape_B = dones.shape
        # Encode before loop begins
        graph_embeddings = encoder(states[0])
        # Note AutoGraph can't change tensor shape and dtype in while loop
        while tf.math.logical_not(tf.reduce_all(dones == ones)):
            # Get policy
            # B, N
            policies = decoder([graph_embeddings, states[1], states[2]])

            # Determine actions to take
            if greedy:
                actions = tf.argmax(policies, axis=1, output_type=tf.int32)
            else:
                actions = sample_action(policies)
            # Filter to ignore probabilities of actions which won't be taken
            # B, N
            indices = tf.one_hot(
                actions, depth=policies.shape[-1], dtype=tf.float32)
            # Probabilities of choosing the actions
            # B
            sample_log_probability = tf.math.log(
                tf.reduce_sum(indices * policies, axis=-1))

            # Average over log probabilities of sampling the actions
            # B
            new_divisor = divisor + tf.cast(int_not(dones), tf.float32)
            # B
            update = (log_likelihood / new_divisor) * divisor + \
                sample_log_probability / new_divisor - log_likelihood
            # B
            divisor = tf.identity(new_divisor)

            # Calculate average of log probabilities for undone instances
            # B
            log_likelihood = log_likelihood + tf.where(
                dones == ones,
                tf.zeros(log_likelihood.shape, dtype=tf.float32),
                update
            )

            states, new_rewards, dones = env.step(actions)
            rewards = rewards + tf.cast((1-dones), tf.float32) * new_rewards
            # Set shape explicitly to define loop variables' shapes before run
            dones.set_shape(shape_B)
            rewards.set_shape(shape_B)
            log_likelihood.set_shape(shape_B)
            divisor.set_shape(shape_B)

        return rewards, log_likelihood

    def build(self):
        # build
        graph, *other = self.online_env.reset()
        embedding = self.online_encoder(graph)
        self.baseline_encoder(graph)
        inputs = [embedding] + other
        self.online_decoder(inputs)
        self.baseline_decoder(inputs)

    def synchronize(self, original: tf.keras.models.Model, target: tf.keras.models.Model):
        target.set_weights(original.get_weights())

    @tf.function
    def validate(self):

        # ** Initialization ** #

        # Initialize state
        self.online_env.reset()
        # Copy env list for baseline
        self.baseline_env.import_states(self.online_env.export_states())

        base_rewards, _ = self.play_game(
            env=self.baseline_env,
            encoder=self.baseline_encoder,
            decoder=self.baseline_decoder,
            greedy=tf.constant(True)
        )

        # Execute an episode for each online environment
        online_rewards, _ = self.play_game(
            env=self.online_env,
            encoder=self.online_encoder,
            decoder=self.online_decoder,
            greedy=tf.constant(False)
        )

        return ttest_smaller(base_rewards, online_rewards, self.significance)

    def save(self, path):
        base_path = pathlib.Path(path)
        encoder_path = base_path / "encoder/"
        decoder_path = base_path / "decoder/"
        self.online_encoder.save_weights(encoder_path)
        self.online_decoder.save_weights(decoder_path)

    def load(self, path):
        base_path = pathlib.Path(path)
        encoder_path = base_path / "encoder/"
        decoder_path = base_path / "decoder/"
        self.online_encoder.load_weights(encoder_path)
        self.online_decoder.load_weights(decoder_path)
        self.baseline_encoder.load_weights(encoder_path)
        self.baseline_decoder.load_weights(decoder_path)

    def demo(self, graph_size=None):
        raise NotImplementedError
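
A minimal driver sketch for the class above, assuming you supply builder callables compatible with play_game; build_encoder and build_decoder below are placeholders to be replaced with the project's actual graph-encoder and policy-decoder builders:

def build_encoder() -> tf.keras.models.Model:
    # Placeholder: must return a model mapping node coordinates (states[0])
    # to graph embeddings, as consumed by play_game.
    raise NotImplementedError

def build_decoder() -> tf.keras.models.Model:
    # Placeholder: must return a model mapping [graph_embeddings, states[1], states[2]]
    # to per-node action probabilities of shape (batch_size, n_nodes).
    raise NotImplementedError

agent = Reinforce(
    encoder_builder=build_encoder,
    decoder_builder=build_decoder,
    n_nodes=20,
    n_parallels=128,
    save_dir="./models/",
)
agent.start()  # per epoch: n_iterations training steps, then a t-test against the greedy baseline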
Code Example #11
File: reinforce.py Project: yotaro-shimose/TSPMDP
    def play_game(
        self,
        env: TSPMDP,
        encoder: tf.keras.models.Model,
        decoder: tf.keras.models.Model,
        greedy: tf.Tensor = tf.constant(False)
    ):
        """play games in parallels

        Args:
            envs (tf.Module): list of environments which are RESET.
            network (tf.keras.models.Model): [description]
            greedy (bool, optional): [description]. Defaults to False.

        Returns:
            tuple(tf.Tensor(batch_size), tf.Tensor(batch_size, graph_size, graph_size)):
                rewards, policies
        """

        # ** Initialization ** #
        # Get graph
        dones: tf.Tensor = tf.zeros((self.n_parallels,), dtype=tf.int32)
        ones: tf.Tensor = tf.ones(dones.shape, dtype=tf.int32)
        rewards: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        log_likelihood: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        states: tf.Tensor = env.get_states()
        divisor: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        # shape variables
        shape_B = dones.shape
        # Encode before loop begins
        graph_embeddings = encoder(states[0])
        # Note AutoGraph can't change tensor shape and dtype in while loop
        while tf.math.logical_not(tf.reduce_all(dones == ones)):
            # Get policy
            # B, N
            policies = decoder([graph_embeddings, states[1], states[2]])

            # Determine actions to take
            if greedy:
                actions = tf.argmax(policies, axis=1, output_type=tf.int32)
            else:
                actions = sample_action(policies)
            # Filter to ignore probabilities of actions which won't be taken
            # B, N
            indices = tf.one_hot(
                actions, depth=policies.shape[-1], dtype=tf.float32)
            # Probabilities of choosing the actions
            # B
            sample_log_probability = tf.math.log(
                tf.reduce_sum(indices * policies, axis=-1))

            # Average over log probabilities of sampling the actions
            # B
            new_divisor = divisor + tf.cast(int_not(dones), tf.float32)
            # B
            update = (log_likelihood / new_divisor) * divisor + \
                sample_log_probability / new_divisor - log_likelihood
            # B
            divisor = tf.identity(new_divisor)

            # Calculate average of log probabilities for undone instances
            # B
            log_likelihood = log_likelihood + tf.where(
                dones == ones,
                tf.zeros(log_likelihood.shape, dtype=tf.float32),
                update
            )

            states, new_rewards, dones = env.step(actions)
            rewards = rewards + tf.cast((1-dones), tf.float32) * new_rewards
            # Set shape explicitly to define loop variables' shapes before run
            dones.set_shape(shape_B)
            rewards.set_shape(shape_B)
            log_likelihood.set_shape(shape_B)
            divisor.set_shape(shape_B)

        return rewards, log_likelihood
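
play_game also depends on two helpers, sample_action and int_not, that are not shown in these excerpts. A plausible sketch of what they need to do, given the shapes used above (this is an assumption, not the repository's implementation):

import tensorflow as tf

def sample_action(policies: tf.Tensor) -> tf.Tensor:
    # Sample one node index per batch row from a (batch_size, n_nodes)
    # probability matrix; a small epsilon guards against log(0).
    logits = tf.math.log(policies + 1e-10)
    return tf.squeeze(
        tf.random.categorical(logits, num_samples=1, dtype=tf.int32), axis=-1)

def int_not(x: tf.Tensor) -> tf.Tensor:
    # Logical NOT for 0/1 int32 tensors: 1 -> 0, 0 -> 1.
    return 1 - x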