def load_network(self, key):
    lc.log.info("network: {}".format(key))
    if key[:3] == 'gen':
        print(key[4])
        self.network = Network.generate(int(key[4]))
    else:
        self.network = self.networks[key]
def b2():
    a, b = Node(7 + 3, 1 + 3), Node(7 + 3, 4 + 3)
    c, d = Node(4 + 3, 4 + 3), Node(10 + 3, 4 + 3)
    e = Node(10 + 3, 7 + 3)
    return Network([a, b, c, d, e],
                   [Link(a, b), Link(b, c), Link(b, d), Link(d, e)])
def __init__(self, config, environment):
    """
    :type config: app.config.policies.CategoricalPolicyConfig
    :type environment: app.config.environments.EnvironmentConfig
    """
    super(Policy, self).__init__()
    input_size = environment.observation_size
    output_size = len(config.actions)
    hidden_size = config.network.hidden_size
    number_of_hidden_layers = config.network.number_of_hidden_layers
    self.actions = torch.tensor(config.actions)
    self.network = Network(input_size, hidden_size, output_size, number_of_hidden_layers)
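# Illustrative sketch (not code from this repo): how a categorical policy with a
# discrete action set like self.actions above can turn network logits into an
# action and its log-probability. The action values and logits below are placeholders.
import torch

actions = torch.tensor([-1.0, 0.0, 1.0])    # assumed discrete action set
logits = torch.tensor([0.2, 1.5, -0.3])     # assumed network output for one observation
distribution = torch.distributions.Categorical(logits=logits)
index = distribution.sample()               # sample an action index
action = actions[index]                     # map the index back to the action value
log_probability = distribution.log_prob(index)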
def __init__(self, config, environment, deterministic=False):
    """
    :type config: app.config.policies.GaussianPolicyConfig
    :type environment: app.config.environments.EnvironmentConfig
    """
    super(Policy, self).__init__()
    input_size = environment.observation_size
    output_size = 2  # mean and log(std)
    hidden_size = config.network.hidden_size
    number_of_hidden_layers = config.network.number_of_hidden_layers
    self.deterministic = deterministic
    self.network = Network(input_size, hidden_size, output_size, number_of_hidden_layers)
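# Illustrative sketch (not necessarily this repo's forward pass): the two network
# outputs above are mean and log(std); a common SAC-style way to turn them into a
# bounded action is a reparameterized Gaussian sample squashed by tanh. All values
# below are placeholders.
import torch

network_output = torch.tensor([[0.3, -0.5]])   # assumed (mean, log_std) for one observation
mean, log_std = network_output[:, 0], network_output[:, 1]
normal = torch.distributions.Normal(mean, log_std.exp())
pre_tanh_action = normal.rsample()             # reparameterized sample keeps gradients
action = torch.tanh(pre_tanh_action)           # squash into (-1, 1)
# change-of-variables correction for the tanh squashing
log_probability = normal.log_prob(pre_tanh_action) - torch.log(1.0 - action.pow(2) + 1e-6)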
def triangle_network():
    node_a = Node(3, 3)
    node_b = Node(12, 6)
    node_c = Node(6, 12)
    node_e = Node(6, 6)
    node_f = Node(14, 14)
    link_1 = Link(node_a, node_b, 'AB')
    link_2 = Link(node_b, node_c, 'BC')
    link_3 = Link(node_c, node_a, 'AC')
    return Network([node_a, node_b, node_c, node_e, node_f], [link_1, link_2, link_3])
def all_networks():
    return {
        # "one node": one_node_network(),
        # "two nodes": two_nodes_network(),
        # "triangle": triangle_network(),
        # "horizontal line": horizontal_network(),
        # "vertical line": vertical_network(),
        # "whole network": whole_network(),
        "default": Network(),
        "l": l_network(),
        "gen 0 connection": None,
        "gen 1 connection": None,
        "gen 2 connections": None,
        "gen 3 connections": None
    }
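# Hypothetical illustration of how load_network (above) resolves these keys: keys
# starting with "gen" are parsed for the single digit at index 4 and passed to
# Network.generate, everything else is looked up in the dictionary. StubNetwork and
# the string values below are placeholders, not the real Network class.
class StubNetwork:
    @staticmethod
    def generate(connections):
        return "generated network with {} connections".format(connections)

stub_networks = {"default": "default network", "l": "l network"}

def resolve(key, networks=stub_networks):
    if key[:3] == 'gen':
        return StubNetwork.generate(int(key[4]))   # e.g. "gen 3 connections" -> 3
    return networks[key]

print(resolve("gen 3 connections"))   # generated network with 3 connections
print(resolve("default"))             # default network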
def whole_network():
    a, b, c, d = Node(3, 4), Node(10.5, 4), Node(6, 8), Node(10.5, 8)
    e, f, g, h = Node(16.5, 8), Node(18, 6), Node(3, 16), Node(13.5, 12)
    i, j = Node(16.5, 16), Node(3, 12)
    return Network([a, b, c, d, e, f, g, h, i, j], [
        Link(a, b), Link(a, c), Link(a, j), Link(b, d), Link(c, d),
        Link(d, h), Link(d, e), Link(e, f), Link(h, i), Link(j, g),
        Link(j, h), Link(h, e)
    ])
def exp1_network():
    home = Node(5, 8)
    cinema = Node(8, 7)
    police = Node(7, 3)
    market = Node(14, 5)
    city_hall = Node(15, 10)
    school = Node(7, 12)
    tower = Node(3, 15)
    links = [
        Link(home, cinema), Link(home, police), Link(cinema, police),
        Link(police, market), Link(market, city_hall), Link(school, city_hall),
        Link(home, school)
    ]
    return Network([home, cinema, police, market, city_hall, school, tower], links)
def conc1_network():
    sophie = Node(6, 2)
    mathiew = Node(2, 6)
    jean = Node(10, 6)
    martin = Node(2, 10)
    harold = Node(10, 10)
    helene = Node(14, 10)
    julie = Node(10, 14)
    marie = Node(14, 14)
    links = [
        Link(sophie, mathiew), Link(sophie, jean), Link(jean, mathiew),
        Link(martin, mathiew), Link(jean, harold), Link(jean, helene),
        Link(mathiew, harold), Link(harold, julie), Link(helene, marie),
        Link(julie, marie)
    ]
    return Network([sophie, mathiew, jean, martin, harold, helene, julie, marie], links)
def w():
    a, b = Node(10, 10), Node(8.5, 10)
    return Network([a, b], [Link(a, b)])
def s():
    a, b = Node(10, 10), Node(10, 11.5)
    return Network([a, b], [Link(a, b)])
def l_network():
    a = Node(10, 10, 'Paris')
    b, c = Node(10, 7, 'Lille'), Node(13, 10, 'Strasbourg')
    return Network([a, b, c], [Link(a, b), Link(a, c)])
def f2():
    c, d = Node(10, 3), Node(10, 6)
    e, f, g = Node(8 + 5, 6), Node(3 + 5, 8), Node(6 + 5, 8)
    return Network([c, d, e, f, g],
                   [Link(c, d), Link(d, e), Link(d, f), Link(f, g)])
def d2():
    c, d = Node(4 + 3, 4), Node(6 + 3, 6)
    e, f, g = Node(6 + 3, 9), Node(8 + 3, 7), Node(9 + 3, 9)
    return Network([c, d, e, f, g],
                   [Link(c, d), Link(d, e), Link(e, f), Link(e, g)])
def e2():
    c, d = Node(4 + 4, 4), Node(6 + 4, 6)
    e, f, g = Node(6 + 4, 9), Node(3 + 4, 9), Node(9 + 4, 9)
    return Network([c, d, e, f, g],
                   [Link(c, d), Link(d, e), Link(e, f), Link(e, g)])
class Model:
    """
    Soft Actor-Critic model.
    """

    def __init__(self, config, environment, policy):
        """
        :type config: app.config.model.ModelConfig
        :type environment: app.environments.environment.Environment
        :type policy: app.policies.policy.Policy
        """
        q_output_size = 1
        q_input_size = environment.observation_size + environment.action_size
        hidden_size = config.network.hidden_size
        number_of_hidden_layers = config.network.number_of_hidden_layers
        self.target_entropy = -environment.action_size
        self.log_alpha = torch.zeros(1, requires_grad=True)
        self.policy = policy
        self.reward_scale = config.reward_scale
        self.discount_factor = config.discount_factor
        self.exponential_weight = config.exponential_weight
        self.q1 = Network(q_input_size, hidden_size, q_output_size, number_of_hidden_layers, nn.ReLU())
        self.q2 = Network(q_input_size, hidden_size, q_output_size, number_of_hidden_layers, nn.ReLU())
        self.target_q1 = Network(q_input_size, hidden_size, q_output_size, number_of_hidden_layers, nn.ReLU())
        self.target_q2 = Network(q_input_size, hidden_size, q_output_size, number_of_hidden_layers, nn.ReLU())
        self.q_criterion = nn.MSELoss()
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=config.learning_rate_policy)
        self.policy_optimizer = optim.Adam(policy.parameters(), lr=config.learning_rate_policy)
        self.q1_optimizer = optim.Adam(self.q1.parameters(), lr=config.learning_rate_q)
        self.q2_optimizer = optim.Adam(self.q2.parameters(), lr=config.learning_rate_q)

    def train_batch(self, observations, next_observations, actions, rewards, terminals):
        """
        Performs one gradient update on a batch of transitions.

        Assumes inputs are torch tensors.
        """
        alpha = self.log_alpha.exp()
        alpha_detached = alpha.detach()
        policy_actions, policy_log_probability = self.policy(observations)
        policy_next_actions, policy_next_log_probability = self.policy(next_observations)
        # concatenate observations and corresponding actions
        observation_actions = torch.cat((observations, actions), dim=1)
        observation_policy_actions = torch.cat((observations, policy_actions), dim=1)
        next_observation_policy_next_actions = torch.cat((next_observations, policy_next_actions), dim=1)
        # q-values
        q1_policy_actions = self.q1(observation_policy_actions)
        q2_policy_actions = self.q2(observation_policy_actions)
        q_policy_actions = torch.min(q1_policy_actions, q2_policy_actions)
        q1_actions = self.q1(observation_actions)
        q2_actions = self.q2(observation_actions)
        # target q-values
        target_q1_policy_next_actions = self.target_q1(next_observation_policy_next_actions)
        target_q2_policy_next_actions = self.target_q2(next_observation_policy_next_actions)
        target_q_policy_next_actions = torch.min(target_q1_policy_next_actions, target_q2_policy_next_actions)
        value_next_observation = target_q_policy_next_actions - alpha_detached * policy_next_log_probability
        q_target = self.reward_scale * rewards + (1.0 - terminals) * self.discount_factor * value_next_observation
        # losses
        q1_loss = self.q_criterion(q1_actions, q_target.detach())
        q2_loss = self.q_criterion(q2_actions, q_target.detach())
        policy_loss = (alpha_detached * policy_log_probability - q_policy_actions).mean()
        alpha_loss = -(alpha * (policy_log_probability + self.target_entropy).detach()).mean()
        # optimize
        self.optimize(self.q1_optimizer, q1_loss)
        self.optimize(self.q2_optimizer, q2_loss)
        self.optimize(self.policy_optimizer, policy_loss)
        self.optimize(self.alpha_optimizer, alpha_loss)
        self.update_exponential_moving_target(self.q1, self.target_q1)
        self.update_exponential_moving_target(self.q2, self.target_q2)
        return policy_loss.detach().numpy(), q1_loss.detach().numpy(), q2_loss.detach().numpy(), alpha_loss.detach().numpy()

    def get_action(self, observation):
        """
        Computes next action.

        Assumes input is a numpy array.
        """
        return self.policy.get_action(observation)

    def optimize(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def copy_parameters(self, source, target):
        for source_param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(source_param.data)

    def update_exponential_moving_target(self, q, target):
        for q_param, target_param in zip(q.parameters(), target.parameters()):
            q_contribution = self.exponential_weight * q_param.data
            target_contribution = (1.0 - self.exponential_weight) * target_param.data
            target_param_new = q_contribution + target_contribution
            target_param.data.copy_(target_param_new)

    def eval_mode(self):
        self.policy.eval()
        self.q1.eval()
        self.q2.eval()
        self.target_q1.eval()
        self.target_q2.eval()

    def train_mode(self):
        self.policy.train()
        self.q1.train()
        self.q2.train()
        self.target_q1.train()
        self.target_q2.train()

    def load(self, path):
        self.policy.load_state_dict(torch.load(os.path.join(path, 'policy.pt')))
        self.q1.load_state_dict(torch.load(os.path.join(path, 'q1.pt')))
        self.q2.load_state_dict(torch.load(os.path.join(path, 'q2.pt')))
        self.copy_parameters(self.q1, self.target_q1)
        self.copy_parameters(self.q2, self.target_q2)

    def save(self, path):
        torch.save(self.policy.state_dict(), os.path.join(path, 'policy.pt'))
        torch.save(self.q1.state_dict(), os.path.join(path, 'q1.pt'))
        torch.save(self.q2.state_dict(), os.path.join(path, 'q2.pt'))
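# Minimal self-contained sketch of the exponential moving-average target update used
# in update_exponential_moving_target above, shown on plain torch modules so it runs
# without the rest of the repo. The weight value and module shapes are assumptions.
import torch
import torch.nn as nn

q = nn.Linear(4, 1)
target_q = nn.Linear(4, 1)
exponential_weight = 0.005   # assumed value; the model reads config.exponential_weight

with torch.no_grad():
    for q_param, target_param in zip(q.parameters(), target_q.parameters()):
        target_param.data.copy_(
            exponential_weight * q_param.data
            + (1.0 - exponential_weight) * target_param.data)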
def one_node_network():
    return Network([Node(5, 5)], [])
def nw():
    a, b = Node(10, 10), Node(9, 9)
    return Network([a, b], [Link(a, b)])
def horizontal_network():
    a = Node(6, 8)
    b = Node(10, 8)
    return Network([a, b], [Link(a, b)])
def a2():
    c, d = Node(10, 4), Node(8, 6)
    e, f, g = Node(8, 9), Node(10, 7), Node(11, 9)
    return Network([c, d, e, f, g],
                   [Link(c, d), Link(d, e), Link(e, g), Link(e, f)])
def vertical_network():
    a = Node(10, 6)
    b = Node(10, 10)
    return Network([a, b], [Link(a, b)])
def c2():
    c, d = Node(10, 4), Node(8, 6)
    e, f, g = Node(10, 8), Node(8, 9), Node(5, 9)
    return Network([c, d, e, f, g],
                   [Link(c, d), Link(d, e), Link(d, f), Link(f, g)])
def se():
    a, b = Node(10, 10), Node(11, 11)
    return Network([a, b], [Link(a, b)])
def two_nodes_network():
    return Network([Node(5, 5), Node(10, 10)], [])
def f1():
    c, d = Node(7, 6), Node(10, 6)
    e, f, g = Node(13, 6), Node(8, 8), Node(11, 8)
    return Network([c, d, e, f, g],
                   [Link(c, d), Link(d, e), Link(d, f), Link(f, g)])