def raise_at_final(env: TSPMDP, actions: list):
    """Return True iff stepping through `actions` raises
    tf.errors.InvalidArgumentError exactly at the final action."""
    env.reset()
    answer = False
    for i in range(len(actions)):
        action = actions[i]
        try:
            action = tf.constant([action], dtype=tf.int32)
            env.step(action)
        except tf.errors.InvalidArgumentError:
            if i == len(actions) - 1:
                answer = True
            else:
                break
    return answer
def complete_synario(env: TSPMDP, actions: list):
    """Return True iff stepping through `actions` reaches a terminal state
    without raising tf.errors.InvalidArgumentError."""
    env.reset()
    answer = False
    is_terminal = False
    for i in range(len(actions)):
        action = actions[i]
        try:
            action = tf.constant([action], dtype=tf.int32)
            _, _, is_terminal = env.step(action)
        except tf.errors.InvalidArgumentError:
            break
        else:
            if is_terminal:
                answer = True
    return answer
def test_env_mask():
    env = TSPMDP(batch_size=1, n_nodes=10)
    # Revisiting the starting node immediately is invalid.
    actions = [0]
    assert raise_at_final(env, actions)
    # Revisiting node 4 is invalid.
    actions = [3, 4, 4]
    assert raise_at_final(env, actions)
    # Returning to the depot before all nodes are visited is invalid.
    actions = [1, 2, 3, 4, 0]
    assert raise_at_final(env, actions)
    # Revisiting node 9 at the last step is invalid.
    actions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 9]
    assert raise_at_final(env, actions)
def test_env_rewards():
    batch_size = 1
    n_nodes = 4
    reward_on_episode = False
    env = TSPMDP(batch_size=batch_size, n_nodes=n_nodes,
                 reward_on_episode=reward_on_episode)
    env.reset()
    # Replace the random coordinates with a unit square so the tour length
    # (and hence the total negative reward) is known exactly.
    state_dict = env.export_states()
    new_coordinates = tf.constant([[[0., 0.], [0., 1.], [1., 1.], [1., 0.]]])
    state_dict["coordinates"] = new_coordinates
    env.import_states(state_dict)
    actions = [1, 2, 3]
    episode_reward = 0
    for action in actions:
        _, reward, _ = env.step(tf.constant([action], dtype=tf.int32))
        episode_reward += reward
    # The tour 0 -> 1 -> 2 -> 3 -> 0 on the unit square has length 4 = n_nodes.
    assert episode_reward == -n_nodes
def test_env_reward_on_episode():
    B, N = 128, 100
    original = TSPMDP(batch_size=B, n_nodes=N, reward_on_episode=True)
    original.reset()
    copy = TSPMDP(batch_size=B, n_nodes=N, reward_on_episode=False)
    init_state = original.export_states()
    init_state.pop("rewards")
    copy_sum = 0
    copy.import_states(init_state)
    actions = tf.constant([list(range(1, N)) for _ in range(B)],
                          dtype=tf.int32)
    original_reward = None
    for j in range(actions.shape[1]):
        action = actions[:, j]
        _, original_reward, _ = original.step(action)
        _, copy_reward, _ = copy.step(action)
        # With reward_on_episode=True the reward is zero until the last step.
        assert tf.reduce_all(
            original_reward == 0.) or j == actions.shape[1] - 1
        copy_sum += copy_reward
    # The episode reward must equal the sum of the per-step rewards.
    tf.assert_equal(original_reward, copy_sum)
def test_env_synchronization():
    original = TSPMDP(batch_size=1, n_nodes=10)
    original.reset()
    copy = TSPMDP(batch_size=1, n_nodes=10)
    copy.import_states(original.export_states())
    original_sum = 0
    copy_sum = 0
    actions = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    for i in range(len(actions)):
        action = actions[i]
        action = tf.constant([action], dtype=tf.int32)
        _, original_reward, _ = original.step(action)
        _, copy_reward, _ = copy.step(action)
        original_sum += original_reward
        copy_sum += copy_reward
    assert original_sum == copy_sum
def test_env_complete():
    env = TSPMDP(batch_size=1, n_nodes=10)
    actions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
    assert complete_synario(env, actions)
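# The tests above drive TSPMDP only through a handful of calls. As a point of
# reference, this is a minimal interface sketch inferred purely from those
# call sites; the real TSPMDP implementation lives elsewhere in the project
# and may differ in details.
import tensorflow as tf


class TSPMDPInterfaceSketch:
    """Hypothetical interface stub, not the project's actual environment."""

    def __init__(self, batch_size: int, n_nodes: int,
                 reward_on_episode: bool = False):
        ...

    def reset(self):
        """Sample a new batch of graphs and return the initial states."""

    def step(self, actions):
        """Apply one action per instance (shape (B,), dtype int32).

        Returns (states, rewards, dones); an invalid action such as
        revisiting a node raises tf.errors.InvalidArgumentError.
        """

    def export_states(self) -> dict:
        """Snapshot the batch state (including "coordinates" and "rewards")."""

    def import_states(self, state_dict: dict):
        """Restore a previously exported snapshot, e.g. to synchronize envs."""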
def __call__(self):
    return TSPMDP(batch_size=self.batch_size,
                  n_nodes=self.n_nodes,
                  reward_on_episode=self.reward_on_episode)
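# The __call__ above belongs to an environment-builder object whose
# batch_size, n_nodes and reward_on_episode attributes are set elsewhere.
# A rough usage illustration follows; the wrapper class name and constructor
# are assumptions, not the project's actual code.
class TSPMDPBuilderSketch:
    def __init__(self, batch_size: int, n_nodes: int,
                 reward_on_episode: bool = False):
        self.batch_size = batch_size
        self.n_nodes = n_nodes
        self.reward_on_episode = reward_on_episode

    def __call__(self):
        return TSPMDP(batch_size=self.batch_size,
                      n_nodes=self.n_nodes,
                      reward_on_episode=self.reward_on_episode)


# Each call yields a fresh, identically configured environment instance.
builder = TSPMDPBuilderSketch(batch_size=32, n_nodes=20)
env_a, env_b = builder(), builder()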
class Reinforce:
    def __init__(
        self,
        encoder_builder: Callable,
        decoder_builder: Callable,
        n_epochs: int = 10000,
        n_nodes: int = 20,
        n_iterations: int = 10,
        n_validations: int = 100,
        n_parallels: int = 5,
        learning_rate: float = 1e-5,
        significance: float = 0.15,
        logger: TFLogger = None,
        save_dir="./models/",
        load_dir=None,
    ):
        self.n_epochs = n_epochs
        self.n_iterations = n_iterations
        self.n_validations = n_validations
        self.n_parallels = n_parallels
        self.online_env = TSPMDP(batch_size=n_parallels, n_nodes=n_nodes)
        self.baseline_env = TSPMDP(batch_size=n_parallels, n_nodes=n_nodes)
        self.save_dir = save_dir
        self.online_encoder: tf.keras.models.Model = encoder_builder()
        self.online_decoder: tf.keras.models.Model = decoder_builder()
        self.baseline_encoder: tf.keras.models.Model = encoder_builder()
        self.baseline_decoder: tf.keras.models.Model = decoder_builder()
        if load_dir:
            self.load(load_dir)
        self.optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.significance = significance
        self.logger = logger

    def start(self):
        self.build()
        for epoch in range(self.n_epochs):
            for iteration in range(self.n_iterations):
                metrics = self.train_on_episode()
                step = epoch * self.n_iterations + iteration
                if self.logger:
                    self.logger.log(metrics, step)
            # Replace the baseline with the online policy when the online
            # policy is significantly better.
            if self.validate():
                print(f"Epoch: {epoch}, Validation passed")
                if self.save_dir:
                    self.save(self.save_dir)
                self.synchronize(self.online_encoder, self.baseline_encoder)
                self.synchronize(self.online_decoder, self.baseline_decoder)

    @tf.function
    def train_on_episode(self):
        """Execute parallel episodes at the same time and learn from the
        experiences.
        """
        # ** Initialization ** #
        # Initialize state
        self.online_env.reset()
        # Copy env state for the baseline.
        self.baseline_env.import_states(self.online_env.export_states())
        # Greedy rollout
        base_rewards, _ = self.play_game(
            env=self.baseline_env,
            encoder=self.baseline_encoder,
            decoder=self.baseline_decoder,
            greedy=tf.constant(True)
        )
        with tf.GradientTape() as tape:
            # Execute an episode for each online environment
            online_rewards, log_likelihood = self.play_game(
                env=self.online_env,
                encoder=self.online_encoder,
                decoder=self.online_decoder,
                greedy=tf.constant(False)
            )
        # ** Learn from experience ** #
        trainable_variables = self.online_encoder.trainable_variables + \
            self.online_decoder.trainable_variables
        excess_cost = tf.stop_gradient(base_rewards - online_rewards)
        # Get policy gradient to apply to our network
        policy_gradient = tape.gradient(
            tf.reduce_mean(excess_cost * log_likelihood),
            trainable_variables)
        # Apply gradient
        self.optimizer.apply_gradients(
            zip(policy_gradient, trainable_variables))
        # Metrics
        metrics = {
            "cost against baseline": tf.reduce_mean(excess_cost),
            "baseline_rewards": tf.reduce_mean(base_rewards),
            "online_rewards": tf.reduce_mean(online_rewards),
        }
        return metrics

    @tf.function
    def play_game(
        self,
        env: TSPMDP,
        encoder: tf.keras.models.Model,
        decoder: tf.keras.models.Model,
        greedy: tf.Tensor = tf.constant(False)
    ):
        """Play games in parallel.

        Args:
            env (TSPMDP): batched environment which has been RESET.
            encoder (tf.keras.models.Model): graph encoder.
            decoder (tf.keras.models.Model): policy decoder.
            greedy (tf.Tensor, optional): take argmax actions instead of
                sampling. Defaults to tf.constant(False).

        Returns:
            tuple(tf.Tensor(batch_size), tf.Tensor(batch_size)):
                rewards, log_likelihood
        """
        # ** Initialization ** #
        # Get graph
        dones: tf.Tensor = tf.zeros((self.n_parallels,), dtype=tf.int32)
        ones: tf.Tensor = tf.ones(dones.shape, dtype=tf.int32)
        rewards: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        log_likelihood: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        states: tf.Tensor = env.get_states()
        divisor: tf.Tensor = tf.zeros(dones.shape, dtype=tf.float32)
        # shape variables
        shape_B = dones.shape
        # Encode before loop begins
        graph_embeddings = encoder(states[0])
        # Note AutoGraph can't change tensor shape and dtype in while loop
        while tf.math.logical_not(tf.reduce_all(dones == ones)):
            # Get policy
            # B, N
            policies = decoder([graph_embeddings, states[1], states[2]])
            # Determine actions to take
            if greedy:
                actions = tf.argmax(policies, axis=1, output_type=tf.int32)
            else:
                actions = sample_action(policies)
            # Filter to ignore probabilities of actions which won't be taken
            # B, N
            indices = tf.one_hot(
                actions, depth=policies.shape[-1], dtype=tf.float32)
            # Probabilities of choosing the actions
            # B
            sample_log_probability = tf.math.log(
                tf.reduce_sum(indices * policies, axis=-1))
            # Average over log probabilities of sampling the actions
            # B
            new_divisor = divisor + tf.cast(int_not(dones), tf.float32)
            # B
            update = (log_likelihood / new_divisor) * divisor + \
                sample_log_probability / new_divisor - log_likelihood
            # B
            divisor = tf.identity(new_divisor)
            # Calculate average of log probabilities for undone instances
            # B
            log_likelihood = log_likelihood + tf.where(
                dones == ones,
                tf.zeros(log_likelihood.shape, dtype=tf.float32),
                update
            )
            states, new_rewards, dones = env.step(actions)
            rewards = rewards + tf.cast((1 - dones), tf.float32) * new_rewards
            # Set shape explicitly to define loop variables' shapes before run
            dones.set_shape(shape_B)
            rewards.set_shape(shape_B)
            log_likelihood.set_shape(shape_B)
            divisor.set_shape(shape_B)
        return rewards, log_likelihood

    def build(self):
        # Build the networks by running one forward pass.
        graph, *other = self.online_env.reset()
        embedding = self.online_encoder(graph)
        self.baseline_encoder(graph)
        inputs = [embedding] + other
        self.online_decoder(inputs)
        self.baseline_decoder(inputs)

    def synchronize(self,
                    original: tf.keras.models.Model,
                    target: tf.keras.models.Model):
        target.set_weights(original.get_weights())

    @tf.function
    def validate(self):
        # ** Initialization ** #
        # Initialize state
        self.online_env.reset()
        # Copy env state for the baseline
        self.baseline_env.import_states(self.online_env.export_states())
        base_rewards, _ = self.play_game(
            env=self.baseline_env,
            encoder=self.baseline_encoder,
            decoder=self.baseline_decoder,
            greedy=tf.constant(True)
        )
        # Execute an episode for each online environment
        online_rewards, _ = self.play_game(
            env=self.online_env,
            encoder=self.online_encoder,
            decoder=self.online_decoder,
            greedy=tf.constant(False)
        )
        return ttest_smaller(base_rewards, online_rewards, self.significance)

    def save(self, path):
        base_path = pathlib.Path(path)
        encoder_path = base_path / "encoder/"
        decoder_path = base_path / "decoder/"
        self.online_encoder.save_weights(encoder_path)
        self.online_decoder.save_weights(decoder_path)

    def load(self, path):
        base_path = pathlib.Path(path)
        encoder_path = base_path / "encoder/"
        decoder_path = base_path / "decoder/"
        self.online_encoder.load_weights(encoder_path)
        self.online_decoder.load_weights(decoder_path)
        self.baseline_encoder.load_weights(encoder_path)
        self.baseline_decoder.load_weights(decoder_path)

    def demo(self, graph_size=None):
        raise NotImplementedError
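# play_game and validate rely on three helpers imported from elsewhere in the
# project: sample_action, int_not and ttest_smaller. The sketches below are
# assumptions consistent with how they are called here (kept graph-compatible
# so they can run inside @tf.function), not the project's actual
# implementations.
import tensorflow as tf
import tensorflow_probability as tfp


def sample_action(policies: tf.Tensor) -> tf.Tensor:
    # Sample one action per instance from (B, N) action probabilities.
    log_probs = tf.math.log(policies + 1e-10)
    return tf.squeeze(
        tf.random.categorical(log_probs, num_samples=1, dtype=tf.int32),
        axis=-1)


def int_not(flags: tf.Tensor) -> tf.Tensor:
    # Logical NOT for int32 0/1 flags: 0 -> 1, 1 -> 0.
    return 1 - flags


def ttest_smaller(base_rewards: tf.Tensor,
                  online_rewards: tf.Tensor,
                  significance: float) -> tf.Tensor:
    # One-sided paired test: True if online_rewards are significantly larger
    # than base_rewards. Uses a normal approximation of the t statistic so it
    # stays graph-compatible.
    diff = online_rewards - base_rewards
    n = tf.cast(tf.size(diff), tf.float32)
    t_stat = tf.reduce_mean(diff) / (
        tf.math.reduce_std(diff) / tf.sqrt(n) + 1e-10)
    p_value = 1. - tfp.distributions.Normal(0., 1.).cdf(t_stat)
    return p_value < significance


# With these in place, training could be launched roughly as
#     Reinforce(encoder_builder=build_encoder, decoder_builder=build_decoder,
#               n_nodes=20, n_parallels=128).start()
# where build_encoder/build_decoder (hypothetical names) return Keras models
# compatible with the states produced by TSPMDP.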