def __init__(self, embedding_size=128, num_layers=2, bidirectional=True, cuda=False):
    """Create a new RNNModel object based on the specifications

    embedding_size -- size of each RNN embedding
    num_layers -- number of RNN layers
    bidirectional -- whether the RNN is bidirectional
    cuda -- whether to use GPU
    """
    super(RNNModel, self).__init__()
    self._num_layers = num_layers
    self._embedding_size = embedding_size
    self._hidden_size = embedding_size
    if bidirectional:
        self._hidden_size //= 2
    # create an embedding for the tokens
    self.embedding = nn.Embedding(len(RNN_TOKENS), embedding_size)
    # create a separate LSTM model for each relation type
    self.lstms = {}
    for relation_type in RNN_RELATIONS:
        lstm = nn.LSTM(embedding_size, self._hidden_size, self._num_layers,
                       bidirectional=bidirectional)
        self.lstms[relation_type] = lstm
        self.add_module("lstm_%s" % relation_type, lstm)
    # create a scoring MLP
    self.score = MLP([embedding_size, 1])
    # check CUDA
    self._cuda = cuda
    if self._cuda:
        self.cuda()
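# Every model and agent in this file builds its network through an `MLP(dimensions)`
# helper that is defined elsewhere in the codebase and not shown here. The class below
# is only a sketch of a minimal implementation consistent with how it is used above
# (`MLP(dims)`, `.initialize()`, `.double()`, `.to(device)`, call on a tensor); the
# layer sizes, activation, and init scheme are assumptions, not the project's actual MLP.

import torch
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, dimensions):
        super().__init__()
        layers = []
        for i in range(len(dimensions) - 1):
            layers.append(nn.Linear(dimensions[i], dimensions[i + 1]))
            if i < len(dimensions) - 2:
                layers.append(nn.ReLU())  # hidden layers only; the output layer stays linear
        self.net = nn.Sequential(*layers)

    def initialize(self):
        # assumed init scheme: Xavier weights, zero biases
        for module in self.net:
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                nn.init.zeros_(module.bias)

    def forward(self, x):
        return self.net(x)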
class MultiAgentPlanner():
    def __init__(self, index, reward_threshold, collision_threshold, world_size,
                 states, num_agents, collision_distance):
        self.index = index
        self.name = 'multi safe q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        kernel = RBF(length_scale=world_size,
                     length_scale_bounds=[(1e1, 1e5), (1e1, 1e5), (1e1, 1e5)]) \
            + WhiteKernel(noise_level=1)
        self.reward_gp = GaussianProcessRegressor(kernel=kernel)
        self.reward_threshold = reward_threshold
        self.collision_threshold = collision_threshold
        self.collision_distance = collision_distance
        self.trajs = [[] for _ in range(num_agents)]
        self.my_states = []
        self.action_traj = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.dimensions = [3, 50, 50, 7]
        self.dqn = MLP(self.dimensions).double()
        self.dqn_l = MLP(self.dimensions).double()
        self.dqn_u = MLP(self.dimensions).double()
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.optimizer_l = optim.RMSprop(self.dqn_l.parameters())
        self.optimizer_u = optim.RMSprop(self.dqn_u.parameters())
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_l = MLP(self.dimensions).double()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.target_u = MLP(self.dimensions).double()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.lr = 1e-3
        self.epsilons = [0. for _ in range(num_agents)]
        self.tau_exploits = [1. for _ in range(num_agents)]
        self.tau_explores = [1. for _ in range(num_agents)]
        self.num_collisions = 0
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore=False):
        possible_actions = copy.copy(Action.SET)
        action_next_states = []
        reward_ls = []
        reward_uncertainty = []
        no_collision_ls = [1.0 for _ in Action.SET]
        best_action = Action.STAY
        # best_action = np.argmax(self.target(torch.tensor(self.states[self.index])).tolist())
        #
        # if explore or np.random.binomial(1, self.eps) == 1:
        #     best_action = possible_actions[np.random.choice(len(possible_actions))]

        # estimate a reward lower bound for each candidate next state from the GP
        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            reward, std = self.reward_gp.predict(np.array([next_state]), return_std=True)
            reward = reward[0]
            std = std[0]
            action_next_states += [(a, next_state)]
            reward_ls += [reward - self.beta * std]
            reward_uncertainty += [std]

        # estimate the probability that no other agent collides with each candidate next state
        for action, next_state in action_next_states:
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue
                cur_agent_state = self.states[agent]
                for agent_action in Action.SET:
                    possible_next_agent_state = cur_agent_state + get_movement(agent_action)
                    if np.linalg.norm(possible_next_agent_state - next_state) >= self.collision_distance:
                        # this move cannot collide, so it does not lower the no-collision probability
                        continue
                    a_prob = self._get_policy(agent, agent_action)
                    no_collision_ls[action] *= (1 - a_prob)
        # for i, l in enumerate(reward_ls):
        #     if l <= self.reward_threshold or not self._returnable(action_next_states[i][1]):
        #         possible_actions.remove(i)
        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)
        possible_actions = list(possible_actions)

        if explore or np.random.binomial(1, self.eps) == 1:
            # most_uncertain_action = Action.STAY
            # largest_uncertainty = -math.inf
            # for action in possible_actions:
            #     if reward_uncertainty[action] > largest_uncertainty:
            #         most_uncertain_action = action
            #         largest_uncertainty = reward_uncertainty[action]
            #
            # best_action = most_uncertain_action
            if len(possible_actions) > 0:
                best_action = possible_actions[np.random.choice(len(possible_actions))]
        else:
            best_q_action = Action.STAY
            best_q = -math.inf
            q_values = self.target(
                torch.tensor(self.states[self.index], dtype=torch.double)).tolist()
            for action in possible_actions:
                if q_values[action] > best_q:
                    best_q_action = action
                    best_q = q_values[action]
            best_action = best_q_action
        if len(possible_actions) == 0:
            # joint_prob = np.array(reward_ls) * np.array(no_collision_ls)
            # best_action = np.argmax(joint_prob)
            best_action = np.argmax(no_collision_ls)
        self.action_traj += [best_action]
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        if len(self.rewards) > 50:
            self.rewards.pop(0)
            self.my_states.pop(0)
        self.rewards += [reward]
        self.states = states
        for i, state in enumerate(states):
            if i == self.index:
                continue
            # a collision occurs when another agent is within collision_distance
            if np.linalg.norm(state - states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        if reward < self.reward_threshold:
            self.num_unsafe += 1
        for i in range(self.num_agents):
            self.trajs[i] += [states[i]]
            if i == self.index:
                self.my_states += [states[i]]
        self.cum_rewards += reward

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.target_u.load_state_dict(self.dqn_u.state_dict())
        self.target_u.eval()
        self.target_l.load_state_dict(self.dqn_l.state_dict())
        self.target_l.eval()
        self.trajs = [[] for _ in range(self.num_agents)]
        self.action_traj = []
        self.states = states
        # self.epsilons = [0. for _ in range(self.num_agents)]
        # self.tau_exploits = [1. for _ in range(self.num_agents)]
        # self.tau_explores = [1. for _ in range(self.num_agents)]
        # self.rewards = []
        # self.cum_rewards = 0

    def learn_from_buffer(self):
        self.reward_gp.fit(self.my_states, self.rewards)
        self._value_func_estimate()
        for agent in range(self.num_agents):
            if agent == self.index:
                continue
            self._optimize_parameters(agent)

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        reward_batch = torch.cat(batch.reward)
        reward_l_batch = []
        reward_u_batch = []
        for state in state_batch:
            cur_state = state.tolist()
            reward, std = self.reward_gp.predict(np.array([cur_state]), return_std=True)
            reward_l_batch.append(
                torch.tensor([reward[0] - self.beta * std[0]], dtype=torch.double))
            reward_u_batch.append(
                torch.tensor([reward[0] + self.beta * std[0]], dtype=torch.double))
        reward_l_batch = torch.cat(reward_l_batch)
        reward_u_batch = torch.cat(reward_u_batch)
        action_batch = torch.cat(batch.action)
        next_state_batch = torch.cat(batch.next_state)

        # mean Q-network
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # upper-bound Q-network
        state_action_values = self.dqn_u(state_batch).gather(1, action_batch)
        next_state_values = self.target_u(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_u_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer_u.zero_grad()
        loss.backward()
        for param in self.dqn_u.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_u.step()

        # lower-bound Q-network
        state_action_values = self.dqn_l(state_batch).gather(1, action_batch)
        next_state_values = self.target_l(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_l_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer_l.zero_grad()
        loss.backward()
        for param in self.dqn_l.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer_l.step()

        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()
            self.target_u.load_state_dict(self.dqn_u.state_dict())
            self.target_u.eval()
            self.target_l.load_state_dict(self.dqn_l.state_dict())
            self.target_l.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state

    def _get_policy(self, agent, action):
        epsilon = self.epsilons[agent]
        tau_explore = self.tau_explores[agent]
        tau_exploit = self.tau_exploits[agent]
        return self._compute_policy_upperbound(epsilon, tau_explore, tau_exploit,
                                               agent, action)

    def _returnable(self, state):
        for a in Action.SET:
            next_state = self._move_coordinate(state, a)
            reward, std = self.reward_gp.predict(np.array([next_state]), return_std=True)
            reward = reward[0]
            std = std[0]
            if reward - self.beta * std >= self.reward_threshold:
                return True
        return False

    def _optimize_parameters(self, agent):
        traj = self.trajs[agent]
        if len(traj) > 20:
            traj = traj[len(traj) - 20:]

        def _compute_log_likelihood(parameters):
            epsilon = parameters[0]
            tau_explore = parameters[1]
            tau_exploit = parameters[2]
            # accumulate the product of step probabilities; converted to a
            # negative log-likelihood at the end
            sum_log_likelihood = 1.0
            for step in range(1, len(traj)):
                prev_state = traj[step - 1]
                cur_state = traj[step]
                movement = np.rint(cur_state - prev_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue
                sum_log_likelihood *= (self._compute_policy_upperbound(
                    epsilon, tau_explore, tau_exploit, agent, action))
            return -np.log(sum_log_likelihood)

        res = minimize(_compute_log_likelihood,
                       np.array([0.5, 1.0, 1.0]),
                       method='L-BFGS-B',
                       bounds=np.array([(1e-6, 1.0), (0.1, 10.0), (0.1, 10.0)]))
        if not np.all(np.equal(res.x, np.array([0.5, 1.0, 1.0]))):
            self.epsilons[agent] = res.x[0]
            self.tau_explores[agent] = res.x[1]
            self.tau_exploits[agent] = res.x[2]

    def _compute_policy_upperbound(self, epsilon, tau_explore, tau_exploit, agent, action):
        q = self.dqn(torch.tensor(self.states[agent], dtype=torch.double)).detach().numpy()
        q_u = self.dqn_u(torch.tensor(self.states[agent], dtype=torch.double)).detach().numpy()
        q_l = self.dqn_l(torch.tensor(self.states[agent], dtype=torch.double)).detach().numpy()
        ofu_denom = copy.copy(q)
        ofu_denom[action] = q_u[action]
        boltz_denom = copy.copy(q_l)
        boltz_denom[action] = q[action]
        explore_mean_q = np.mean(q_u / tau_explore)
        prob_ofu = np.exp(q_u[action] / tau_explore - explore_mean_q) / np.sum(
            np.exp(ofu_denom / tau_explore - explore_mean_q))
        exploit_mean_q = np.mean(q / tau_exploit)
        prob_boltz = np.exp(q[action] / tau_exploit - exploit_mean_q) / np.sum(
            np.exp(boltz_denom / tau_exploit - exploit_mean_q))
        return epsilon * prob_ofu + (1 - epsilon) * prob_boltz
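# A minimal sketch of the episode loop these navigation agents appear to be written for,
# inferred from the interface above (choose_action -> environment step -> update_buffer ->
# learn_from_buffer, with reset between episodes). The `env` object and its `reset()` /
# `step()` methods are hypothetical placeholders, not part of this codebase.

def run_episode(env, agents, num_steps=100, explore=False):
    states = env.reset()                       # hypothetical: one coordinate per agent
    for agent in agents:
        agent.reset(states)
    for _ in range(num_steps):
        actions = [agent.choose_action(explore) for agent in agents]
        next_states, rewards = env.step(actions)  # hypothetical return signature
        for i, agent in enumerate(agents):
            agent.update_buffer(rewards[i], next_states)
            agent.learn_from_buffer()
    return [agent.cum_rewards for agent in agents]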
def __init__(self,
             feed_units: List[int],
             agent_name: str,
             ensemble_size: int = 100,
             prior_variance: float = 1.0,
             model_dims: List[int] = [20],
             lr: float = 1e-3,
             batch_size: int = 128,
             noise_variance=0):
    self.feed_units = copy.deepcopy(feed_units)
    # self.available_units = copy.deepcopy(feed_units)
    self.agent_name = agent_name
    self.cum_rewards: float = 0.
    self.interest_level = 0.
    self.num_features: int = len(feed_units) + 1
    self.noise_variance = noise_variance
    self.ensemble_size: int = ensemble_size
    self.training_data = ReplayMemory(100000)
    # self.training_datas = []
    # for i in range(self.ensemble_size):
    #     self.training_datas.append(ReplayMemory(100000))
    self.latest_feature = None
    self.latest_action = None
    self.prior_variance = prior_variance
    self.model_dims: List[int] = [self.num_features] + model_dims + [2]

    priors = []
    for i in range(self.ensemble_size):
        priors.append(MLP(self.model_dims))
        priors[i].initialize()
        priors[i].double()
        priors[i].eval()
        priors[i].to(device)

    self.models: List[DQNWithPrior] = []
    for i in range(self.ensemble_size):
        self.models.append(
            DQNWithPrior(self.model_dims, priors[i], scale=np.sqrt(self.prior_variance)))
        self.models[i].initialize()
        self.models[i].double()
        self.models[i].to(device)

    self.target_nets: List[DQNWithPrior] = []
    for i in range(self.ensemble_size):
        self.target_nets.append(
            DQNWithPrior(self.model_dims, priors[i], scale=np.sqrt(self.prior_variance)))
        self.target_nets[i].load_state_dict(self.models[i].state_dict())
        self.target_nets[i].double()
        self.target_nets[i].eval()
        self.target_nets[i].to(device)

    self.loss_fn = torch.nn.MSELoss(reduction='sum')
    self.optimizers = []
    for i in range(self.ensemble_size):
        self.optimizers.append(optim.Adam(self.models[i].parameters(), lr=lr))
    self.cur_net = self.target_nets[np.random.choice(self.ensemble_size)]
    self.batch_size = batch_size
    self.gamma = 0.99
    self.running_loss = 0.0
    self.history_unit_indices: List[int] = []
    self.cum_reward_history: List[float] = []
    self.current_feed = 0
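# The ensemble agents above build each member as `DQNWithPrior(dims, prior, scale=...)`
# from a trainable network plus a frozen, randomly initialized prior network, which
# matches the "randomized prior functions" pattern for bootstrapped/ensemble DQN. The
# actual class is defined elsewhere; the version below is only a guess at a minimal
# implementation consistent with how it is used here (initialize(), double(), eval(),
# to(device), and state_dict round-trips between model and target nets).

import torch
import torch.nn as nn

class DQNWithPrior(nn.Module):
    def __init__(self, dimensions, prior, scale=1.0):
        super().__init__()
        self.net = MLP(dimensions)       # trainable part (MLP helper sketched earlier)
        self.prior = prior               # fixed, randomly initialized prior network
        self.scale = scale
        for param in self.prior.parameters():
            param.requires_grad = False  # the prior is never updated

    def initialize(self):
        self.net.initialize()

    def forward(self, x):
        with torch.no_grad():
            prior_q = self.prior(x)
        return self.net(x) + self.scale * prior_q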
class QLearningAgent():
    def __init__(self, index, world_size, states, num_agents, collision_distance):
        self.index = index
        self.name = 'q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.lr = 1e-4
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        best_action = np.argmax(
            self.target(
                torch.tensor(self.states[self.index], dtype=torch.double)).tolist())
        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(len(possible_actions))]
        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state - states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
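# The grid-world agents above rely on a `Transition` namedtuple and a `ReplayMemory`
# buffer that are defined elsewhere in the codebase. The sketch below is an assumption
# about their minimal form: a bounded FIFO buffer with `push`, `sample`, and `__len__`,
# storing (state, action, reward, next_state) tuples. The feed agents later in this file
# push tuples with different fields, so their modules presumably define their own
# Transition / SupervisedTransition variants.

import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))

class ReplayMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)  # old transitions are evicted automatically

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)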
def __init__(self, embedding_size=64, message_size=128, msg_fn_layers=2,
             merge_fn_extra_layers=2, num_passes=1, edge_embedding_size=32, cuda=False):
    super(GNNModel, self).__init__()
    # set hyperparameters
    self.num_passes = num_passes  # number of up/down passes
    self.embedding_size = embedding_size
    self.message_size = message_size
    self.edge_embedding_size = edge_embedding_size
    # set known vocab embedding -- for now also the consts
    self.embedding = nn.Embedding(len(GNN_TOKENS), embedding_size)
    # leaf scoring function for outputting
    self.score = MLP([embedding_size, 1])
    # message functions for each class x child x direction
    self.msg_fn_keys = [k for Class in GNN_NODES for k in Class.msg_fn_keys()]
    # edge embedding for each edge type
    self.edge_embedding = nn.Embedding(len(self.msg_fn_keys), self.edge_embedding_size)
    # create mapping of msg fn keys -> index
    self.msg_fn_dict = {}
    for i, k in enumerate(self.msg_fn_keys):
        self.msg_fn_dict[k] = Variable(torch.LongTensor([i]))
        if cuda:
            self.msg_fn_dict[k] = self.msg_fn_dict[k].cuda()
    # create the message functions:
    msg_fn_shape = [self.embedding_size + self.edge_embedding_size] + \
        [self.message_size] * (msg_fn_layers - 1) + \
        [self.message_size]
    self.msg_fn_shared = MLP(msg_fn_shape)
    # merge function for each class
    self.merge_fn = {}
    for Class in GNN_NODES:
        if Class.nmerge > 0:
            layers = [self.message_size * i for i in range(Class.nmerge, 0, -1)] + \
                [self.message_size] * merge_fn_extra_layers
            self.merge_fn[Class.name] = MergeMLP(layers)
    self.lvar_epsilon = torch.nn.Parameter(torch.FloatTensor([-10.0]))
    # gru for each class
    self.gru = {
        Class.name: nn.GRUCell(
            input_size=self.message_size,
            hidden_size=self.embedding_size,
            bias=True)
        for Class in GNN_NODES
    }
    # add modules in msgfn, mergefn, gru manually
    for k, module in self.gru.items():
        self.add_module("gru_%s" % k, module)
    for k, module in self.merge_fn.items():
        self.add_module("merge_%s" % k, module)
    self._cuda = cuda
    if self._cuda:
        self.cuda()
class SupervisedAgent(Agent):
    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: List[int] = [20],
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data = []
        self.buffer: SupervisedMemory = SupervisedMemory(100000)
        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.rewards: List[float] = []
        self.actions = []

    def choose_action(self):
        available_actions = [0, 1]
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        # base_feature.append(self.interest_level)
        with torch.no_grad():
            outcomes = self.model(torch.tensor(features, dtype=torch.double))
            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()
        best_action = [available_actions[best_index]]
        self.latest_feature = features
        self.latest_action = best_action
        if best_action[0] == 1:
            self.history_unit_indices.append(self.current_feed)
        self.current_feed += 1
        if np.random.rand() < self.epsilon:
            # note: the exploratory action is returned without being recorded
            # in latest_action / history_unit_indices
            return np.random.randint(2)
        # print(best_action)
        return best_action[0]

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        # print(reward)
        self.cum_rewards += reward
        self.rewards += [reward]
        self.training_data += [self.latest_feature]
        self.actions += [self.latest_action]
        # self.current_feed += 1

    def learn_from_buffer(self):
        # print(self.actions)
        # each stored state is labeled with its observed reward-to-go (lifetime value)
        for i, data in enumerate(self.training_data):
            self.buffer.push(
                torch.tensor([data], dtype=torch.double),
                torch.tensor([[np.sum(self.rewards[i:])]], dtype=torch.double),
                torch.tensor([self.actions[i]], dtype=torch.long),
            )
        if len(self.buffer) < self.batch_size:
            return
        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.buffer.sample(self.batch_size)
            batch = SupervisedTransition(*zip(*transitions))
            state_batch = torch.cat(batch.feature)
            action_batch = torch.cat(batch.actions)
            lifetime_value_batch = torch.cat(batch.lifetime_value)
            predicted_lifetime_value = self.model(state_batch).gather(1, action_batch)
            loss = self.loss_fn(predicted_lifetime_value, lifetime_value_batch)
            loss_ensemble += loss.item()
            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble

    def reset(self):
        # record the finished episode's return before clearing it
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
        self.rewards = []
        self.actions = []
        self.training_data = []
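# `SupervisedMemory` / `SupervisedTransition` are likewise not defined in this section.
# Based on how SupervisedAgent pushes (feature, lifetime_value, actions) and unpacks
# batches, a minimal sketch could look like the following; the field names are inferred
# from `batch.feature`, `batch.lifetime_value`, and `batch.actions` above and are an
# assumption. Note the Yahoo supervised agent below pushes only (feature, lifetime_value),
# so its module presumably defines a two-field variant.

import random
from collections import deque, namedtuple

SupervisedTransition = namedtuple('SupervisedTransition',
                                  ('feature', 'lifetime_value', 'actions'))

class SupervisedMemory:
    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        self.memory.append(SupervisedTransition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)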
class YahooSupervisedAgent():
    def __init__(self,
                 initial_feed_candidates,
                 user_features,
                 feed_counts: int,
                 agent_name: str,
                 feed_feature_count=6,
                 user_feature_count=6,
                 model_dims=[50, 25],
                 batch_size: int = 128,
                 interest_unknown: bool = False,
                 boltzmann: bool = True):
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name
        self.cum_rewards: float = 0.
        self.rewards = []
        self.actions = []
        self.training_data = []
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        self.num_features: int = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.buffer: SupervisedMemory = SupervisedMemory(100000)
        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = 0.05
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def choose_action(self):
        available_actions = [candidate.features for candidate in self.current_feed_candidates]
        features = np.array([-1. for _ in range(self.num_features)])
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) * self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[self.feed_counts * self.feed_feature_count:
                              (self.feed_counts + 1) * self.feed_feature_count] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)
        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features, dtype=torch.double).to(device))
            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()
        if self.boltzmann:
            outcomes = outcomes / 0.05
            best_index = np.random.choice(
                len(available_actions),
                p=torch.nn.functional.softmax(
                    outcomes.reshape((len(available_actions))), dim=0).cpu().numpy())
        elif np.random.rand() < 0.05:
            best_index = np.random.choice(len(available_actions))
        best_action = self.current_feed_candidates[best_index]
        self.latest_feature = candidate_features[best_index]
        self.history_actions.append(best_action.features)
        self.current_feed += 1
        return best_action

    def update_buffer(self, scroll: bool, reward: int, new_batch):
        self.cum_rewards += reward
        self.rewards += [reward]
        self.training_data += [self.latest_feature]
        self.current_feed_candidates = new_batch

    def learn_from_buffer(self):
        for i, data in enumerate(self.training_data):
            self.buffer.push(
                torch.tensor([data], dtype=torch.double).to(device),
                torch.tensor([[np.sum(self.rewards[i:])]], dtype=torch.double).to(device),
            )
        if len(self.buffer) < self.batch_size:
            return
        loss_ensemble = 0.
        for _ in range(10):
            transitions = self.buffer.sample(self.batch_size)
            batch = SupervisedTransition(*zip(*transitions))
            state_batch = torch.cat(batch.feature)
            lifetime_value_batch = torch.cat(batch.lifetime_value)
            predicted_lifetime_value = self.model(state_batch)
            loss = self.loss_fn(predicted_lifetime_value, lifetime_value_batch)
            loss_ensemble += loss.item()
            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble

    def reset(self, user_features, initial_feeds, user_embedding):
        # record the finished episode's return before clearing it
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.history_actions = []
        self.rewards = []
        self.actions = []
        self.training_data = []
        self.current_feed = 0
        self.current_feed_candidates = initial_feeds
        self.user_features = user_features
def __init__(
    self,
    initial_feed_candidates,
    user_features,
    feed_counts,
    agent_name: str,
    feed_feature_count=6,
    user_feature_count=6,
    ensemble_size: int = 10,
    prior_variance: float = 1.0,
    model_dims: List[int] = [50, 25],
    bootstrap: bool = True,
    lr: float = 1e-3,
    batch_size: int = 32,
    noise_variance=0,
):
    self.initial_feed_candidates = initial_feed_candidates
    self.current_feed_candidates = initial_feed_candidates
    self.user_features = user_features
    self.feed_counts = feed_counts
    self.agent_name = agent_name
    self.bootstrap = bootstrap
    self.cum_rewards: float = 0.
    self.interest_level = 0.
    self.feed_feature_count = feed_feature_count
    self.user_feature_count = user_feature_count
    self.num_features = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
    self.noise_variance = noise_variance
    self.ensemble_size: int = ensemble_size
    self.training_datas = [ReplayMemory(100000) for _ in range(ensemble_size)]
    self.latest_feature = None
    self.prior_variance = prior_variance
    self.model_dims: List[int] = [self.num_features] + model_dims + [1]

    priors = []
    for i in range(self.ensemble_size):
        priors.append(MLP(self.model_dims))
        priors[i].initialize()
        priors[i].double()
        priors[i].eval()
        priors[i].to(device)

    self.models: List[DQNWithPrior] = []
    for i in range(self.ensemble_size):
        self.models.append(
            DQNWithPrior(self.model_dims, priors[i], scale=np.sqrt(self.prior_variance)))
        self.models[i].initialize()
        self.models[i].double()
        self.models[i].to(device)

    self.target_nets: List[DQNWithPrior] = []
    for i in range(self.ensemble_size):
        self.target_nets.append(
            DQNWithPrior(self.model_dims, priors[i], scale=np.sqrt(self.prior_variance)))
        self.target_nets[i].load_state_dict(self.models[i].state_dict())
        self.target_nets[i].double()
        self.target_nets[i].eval()
        self.target_nets[i].to(device)

    self.loss_fn = torch.nn.MSELoss(reduction='sum')
    self.optimizers = []
    for i in range(self.ensemble_size):
        self.optimizers.append(optim.Adam(self.models[i].parameters(), lr=lr))
    self.cur_index = np.random.choice(self.ensemble_size)
    self.cur_net = self.target_nets[self.cur_index]
    self.batch_size = batch_size
    self.gamma = 0.99
    self.running_loss = 0.0
    self.history_actions = []
    self.cum_reward_history: List[float] = []
    self.current_feed = 0
class DQNAgent(Agent):
    def __init__(
        self,
        feed_units: List[int],
        agent_name: str,
        model_dims: List[int] = [],
        lr: float = 1e-3,
        boltzmann: bool = False,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        self.feed_units = copy.deepcopy(feed_units)
        self.agent_name = agent_name
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.num_features: int = len(feed_units)
        self.training_data: ReplayMemory = ReplayMemory(100000)
        self.model_dims: List[int] = [self.num_features] + model_dims + [2]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)
        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_unit_indices: List[int] = []
        self.latest_feature = None
        self.latest_action = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []
        self.current_loc = [0, 0]

    def choose_action(self):
        available_actions = [0, 1]
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        # base_feature.append(self.interest_level)
        with torch.no_grad():
            outcomes = self.model(torch.tensor(features, dtype=torch.double))
            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()
        best_action = [available_actions[best_index]]
        self.latest_feature = features
        self.latest_action = best_action
        if best_action[0] == 1:
            self.history_unit_indices.append(self.current_feed)
        self.current_feed += 1
        if np.random.rand() < self.epsilon:
            # note: the exploratory action is returned without being recorded
            # in latest_action / history_unit_indices
            return np.random.randint(2)
        # print(best_action)
        return best_action[0]

    def update_buffer(
        self,
        scroll: bool,
        reward: int,
    ):
        # print(reward)
        self.cum_rewards += reward
        if not scroll:
            # terminal transition: no next state
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double),
                torch.tensor([self.latest_action], dtype=torch.long),
                torch.tensor([reward], dtype=torch.double),
                None,
            )
            return
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index in range(self.current_feed):
            features[index] = 0.
        for index in self.history_unit_indices:
            features[index] = 1.
        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double),
            torch.tensor([self.latest_action], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([features], dtype=torch.double),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return
        try:
            loss_ensemble = 0.
            for i in range(0, 10):
                transitions = self.training_data.sample(self.batch_size)
                batch = Transition(*zip(*transitions))
                non_final_mask = torch.tensor(
                    tuple(map(lambda s: s is not None, batch.next_state)),
                    device=device, dtype=torch.bool)
                non_final_next_states = torch.cat(
                    [s for s in batch.next_state if s is not None])
                state_batch = torch.cat(batch.state)
                action_batch = torch.cat(batch.action)
                reward_batch = torch.cat(batch.reward)
                state_action_values = self.model(state_batch).gather(1, action_batch)
                next_state_values = torch.zeros(self.batch_size, device=device,
                                                dtype=torch.double)
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states).max(1)[0].detach()
                expected_state_action_values = self.gamma * next_state_values + reward_batch
                loss = self.loss_fn(state_action_values,
                                    expected_state_action_values.unsqueeze(1))
                loss_ensemble += loss.item()
                self.optimizer.zero_grad()
                loss.backward()
                # for param in self.model.parameters():
                #     param.grad.data.clamp_(-1, 1)
                self.optimizer.step()
            self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
            self.epsilon = 0.999 * self.epsilon
        except Exception:
            # torch.cat raises when every sampled transition is terminal
            print('{}: no non-terminal state'.format(self.agent_name))

    def reset(self):
        # record the finished episode's return before clearing it
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.latest_action = None
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_unit_indices = []
        self.current_loc = [0, 0]
        self.current_feed = 0
class YahooDQNAgent():
    def __init__(
        self,
        initial_feed_candidates,
        user_features,
        feed_counts,
        agent_name: str,
        feed_feature_count=6,
        user_feature_count=6,
        model_dims: List[int] = [50, 25],
        lr: float = 1e-3,
        boltzmann: bool = True,
        epsilon: float = 0.05,
        batch_size: int = 128,
    ):
        self.initial_feed_candidates = initial_feed_candidates
        self.current_feed_candidates = initial_feed_candidates
        self.user_features = user_features
        self.feed_counts = feed_counts
        self.agent_name = agent_name
        self.interest_level = 0
        self.cum_rewards: float = 0.
        self.feed_feature_count = feed_feature_count
        self.user_feature_count = user_feature_count
        self.num_features = feed_counts * feed_feature_count + feed_feature_count + user_feature_count
        self.training_data: ReplayMemory = ReplayMemory(100000)
        self.model_dims: List[int] = [self.num_features] + model_dims + [1]
        self.model = MLP(self.model_dims).double()
        self.model.initialize()
        self.model.to(device)
        self.target_net = MLP(self.model_dims).double().to(device)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.boltzmann: bool = boltzmann
        self.epsilon: float = epsilon
        self.batch_size: int = batch_size
        self.gamma = 0.99
        self.running_loss = 0.0
        self.history_actions = []
        self.latest_feature = None
        self.current_feed = 0
        self.cum_reward_history: List[float] = []

    def choose_action(self):
        available_actions = [candidate.features for candidate in self.current_feed_candidates]
        features = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) * self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[self.feed_counts * self.feed_feature_count:
                              (self.feed_counts + 1) * self.feed_feature_count] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)
        # base_feature.append(self.interest_level)
        with torch.no_grad():
            outcomes = self.model(
                torch.tensor(candidate_features, dtype=torch.double).to(device))
            _, best_index = torch.max(outcomes, 0)
            best_index = best_index.item()
        if self.boltzmann:
            outcomes = outcomes / 0.05
            best_index = np.random.choice(
                len(available_actions),
                p=torch.nn.functional.softmax(
                    outcomes.reshape((len(available_actions))), dim=0).cpu().numpy())
        elif np.random.rand() < 0.05:
            best_index = np.random.choice(len(available_actions))
        best_action = self.current_feed_candidates[best_index]
        self.latest_feature = candidate_features[best_index]
        self.history_actions.append(best_action.features)
        self.current_feed += 1
        return best_action

    def update_buffer(self, scroll: bool, reward: int, new_batch):
        # print(reward)
        self.cum_rewards += reward
        self.current_feed_candidates = new_batch
        if not scroll:
            # terminal transition: no next state
            self.training_data.push(
                torch.tensor([self.latest_feature], dtype=torch.double).to(device),
                torch.tensor([reward], dtype=torch.double).to(device),
                None,
            )
            return
        available_actions = [candidate.features for candidate in self.current_feed_candidates]
        features: List[float] = [-1. for _ in range(self.num_features)]
        for index, action in enumerate(self.history_actions):
            features[index * self.feed_feature_count:(index + 1) * self.feed_feature_count] = action
        features[-self.user_feature_count:] = self.user_features
        candidate_features = []
        for f in available_actions:
            candidate_feature = np.copy(features)
            candidate_feature[self.feed_counts * self.feed_feature_count:
                              (self.feed_counts + 1) * self.feed_feature_count] = f
            candidate_features.append(candidate_feature)
        candidate_features = np.array(candidate_features)
        self.training_data.push(
            torch.tensor([self.latest_feature], dtype=torch.double).to(device),
            torch.tensor([reward], dtype=torch.double).to(device),
            torch.tensor([candidate_features], dtype=torch.double).to(device),
        )

    def learn_from_buffer(self):
        if len(self.training_data) < self.batch_size:
            return
        loss_ensemble = 0.
        for i in range(0, 10):
            transitions = self.training_data.sample(self.batch_size)
            batch = Transition(*zip(*transitions))
            non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device, dtype=torch.bool)
            state_batch = torch.cat(batch.state)
            reward_batch = torch.cat(batch.reward)
            state_action_values = self.model(state_batch)
            all_none = True
            for s in batch.next_state:
                if s is not None:
                    all_none = False
            next_state_values = torch.zeros(self.batch_size, device=device,
                                            dtype=torch.double)
            if not all_none:
                non_final_next_states = torch.cat(
                    [s for s in batch.next_state if s is not None])
                next_state_values[non_final_mask] = self.target_net(
                    non_final_next_states).max(1)[0].reshape((-1)).detach()
            expected_state_action_values = self.gamma * next_state_values + reward_batch
            loss = self.loss_fn(state_action_values,
                                expected_state_action_values.unsqueeze(1))
            loss_ensemble += loss.item()
            self.optimizer.zero_grad()
            loss.backward()
            for param in self.model.parameters():
                param.grad.data.clamp_(-1, 1)
            self.optimizer.step()
        self.running_loss = 0.8 * self.running_loss + 0.2 * loss_ensemble
        self.epsilon = 0.999 * self.epsilon

    def reset(self, user_features, initial_feeds, user_embedding):
        # record the finished episode's return before clearing it
        self.cum_reward_history.append(self.cum_rewards)
        self.cum_rewards = 0.
        self.interest_level = 0.
        self.latest_feature = None
        self.current_feed_candidates = initial_feeds
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.double()
        self.target_net.eval()
        self.target_net.to(device)
        self.history_actions = []
        self.current_feed = 0
        self.user_features = user_features
class NaiveSafeQLearningAgent():
    def __init__(self, index, world_size, states, num_agents, collision_distance,
                 collision_threshold, reward_threshold):
        self.index = index
        self.name = 'naive q agent'
        self.world_size = world_size
        self.states = states
        self.num_agents = num_agents
        self.rewards = []
        self.buffer = ReplayMemory(10000)
        self.gamma = 0.9
        self.beta = 1
        self.action_traj = []
        self.num_collisions = 0
        self.collision_distance = collision_distance
        self.collision_threshold = collision_threshold
        self.reward_threshold = reward_threshold
        self.dimensions = [2, 5, 5, 4]
        self.dqn = MLP(self.dimensions).double()
        self.target = MLP(self.dimensions).double()
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.loss_fn = torch.nn.MSELoss(reduction='sum')
        self.optimizer = optim.RMSprop(self.dqn.parameters())
        self.lr = 1e-4
        self.num_unsafe = 0
        self.eps = 0.1
        self.cum_rewards = 0
        self.target_usage = 0

    def choose_action(self, explore):
        possible_actions = list(copy.copy(Action.SET))
        no_collision_ls = [1.0 for _ in Action.SET]
        for a in Action.SET:
            next_state = self._move_coordinate(self.states[self.index], a)
            next_state = bound_action(next_state, self.world_size[0], self.world_size[1])
            for agent in range(self.num_agents):
                if agent == self.index:
                    continue
                cur_agent_state = self.states[agent]
                if np.linalg.norm(cur_agent_state - next_state) > 1.0 + self.collision_distance:
                    continue
                movement = np.rint(next_state - cur_agent_state)
                action = get_action(movement, self.world_size)
                if action == -1:
                    continue
                no_collision_ls[a] *= 0.75
        for i, l in enumerate(no_collision_ls):
            if l < self.collision_threshold and i in possible_actions:
                possible_actions.remove(i)
        q = self.target(
            torch.tensor(self.states[self.index], dtype=torch.double)).tolist()
        if len(possible_actions) == 0:
            best_action = np.argmax(no_collision_ls)
            self.action_traj.append(best_action)
            return best_action
        best_q = -math.inf
        best_action = Action.UP
        for action in possible_actions:
            if q[action] > best_q:
                best_q = q[action]
                best_action = action
        if explore or np.random.binomial(1, self.eps) == 1:
            best_action = possible_actions[np.random.choice(len(possible_actions))]
        self.action_traj.append(best_action)
        return best_action

    def update_buffer(self, reward, states):
        self.buffer.push(
            torch.tensor([self.states[self.index]], dtype=torch.double),
            torch.tensor([[self.action_traj[-1]]], dtype=torch.long),
            torch.tensor([reward], dtype=torch.double),
            torch.tensor([states[self.index]], dtype=torch.double))
        self.rewards += [reward]
        self.states = states
        if reward < self.reward_threshold:
            self.num_unsafe += 1
        for i, state in enumerate(states):
            if i == self.index:
                continue
            if np.linalg.norm(state - states[self.index]) < self.collision_distance:
                self.num_collisions += 1
                break
        self.cum_rewards += reward

    def learn_from_buffer(self):
        self._value_func_estimate()

    def reset(self, states):
        self.target.load_state_dict(self.dqn.state_dict())
        self.target.eval()
        self.action_traj = []
        self.states = states
        self.rewards = []

    def _value_func_estimate(self):
        if len(self.buffer) < 32:
            return
        self.target_usage += 1
        transitions = self.buffer.sample(32)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        next_state_batch = torch.cat(batch.next_state)
        state_action_values = self.dqn(state_batch).gather(1, action_batch)
        next_state_values = self.target(next_state_batch).max(1)[0].detach()
        expected_state_action_values = (next_state_values * self.gamma) + reward_batch
        loss = self.loss_fn(
            state_action_values,
            expected_state_action_values.unsqueeze(1),
        )
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.dqn.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
        if self.target_usage == 10:
            self.target_usage = 0
            self.target.load_state_dict(self.dqn.state_dict())
            self.target.eval()

    def _move_coordinate(self, state, action):
        movement = get_movement(action)
        return movement + state
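# The navigation agents above also depend on a small grid-world action API
# (`Action.SET`, `Action.STAY`, `Action.UP`, `get_movement`, `get_action`, `bound_action`)
# defined elsewhere. The exact action set cannot be recovered from this section (the
# Q-networks above have 4 and 7 action outputs respectively), so the snippet below is
# purely an illustrative assumption: a 2-D world with stay/up/down/left/right moves.

import numpy as np

class Action:
    STAY, UP, DOWN, LEFT, RIGHT = 0, 1, 2, 3, 4
    SET = [STAY, UP, DOWN, LEFT, RIGHT]

_MOVEMENTS = {
    Action.STAY: np.array([0, 0]),
    Action.UP: np.array([0, 1]),
    Action.DOWN: np.array([0, -1]),
    Action.LEFT: np.array([-1, 0]),
    Action.RIGHT: np.array([1, 0]),
}

def get_movement(action):
    return _MOVEMENTS[action]

def get_action(movement, world_size):
    # map a unit displacement back to an action index; -1 if it is not a single step.
    # world_size is accepted to match the call sites above but unused in this sketch.
    for action, move in _MOVEMENTS.items():
        if np.array_equal(movement, move):
            return action
    return -1

def bound_action(state, width, height):
    # clip a coordinate so the agent stays inside the world
    return np.array([np.clip(state[0], 0, width - 1), np.clip(state[1], 0, height - 1)])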