def init_dqn(args):
    """Initialises and returns the necessary objects for Deep Q-learning:
    Q-network, target network, replay buffer and optimizer.
    """
    logging.info("Initialising DQN with architecture {} and optimizer {}".format(
        args.dqn_archi, args.optimizer_agent))
    if args.dqn_archi == 'mlp':
        q_net = DQN(args.obs_shape, args.n_actions, args)
        q_target = DQN(args.obs_shape, args.n_actions, args)
    elif args.dqn_archi == 'cnn':
        q_net = CnnDQN(args.obs_shape, args.n_actions, args)
        q_target = CnnDQN(args.obs_shape, args.n_actions, args)

    if args.optimizer_agent == 'RMSProp':
        optimizer_agent = optim.RMSprop(q_net.parameters(),
                                        lr=args.lr_agent,
                                        weight_decay=args.lambda_agent)
    else:
        assert args.optimizer_agent == 'Adam'
        optimizer_agent = optim.Adam(q_net.parameters(),
                                     lr=args.lr_agent,
                                     weight_decay=args.lambda_agent)

    q_target.load_state_dict(q_net.state_dict())  # set params of q_target to be the same
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

    if args.epsilon_annealing_scheme == 'linear':
        epsilon_schedule = LinearSchedule(
            schedule_timesteps=int(args.exploration_fraction * args.n_agent_steps),
            initial_p=args.epsilon_start,
            final_p=args.epsilon_stop)
    else:
        assert args.epsilon_annealing_scheme == 'exp'
        epsilon_schedule = ExpSchedule(decay_rate=args.epsilon_decay,
                                       final_p=args.epsilon_stop,
                                       initial_p=args.epsilon_start)

    return q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule
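# Usage sketch (illustrative only): init_dqn expects an args object exposing the
# attributes read above. Every value below is a hypothetical placeholder, not a
# default taken from the original code.
import argparse

example_args = argparse.Namespace(
    dqn_archi='mlp',                    # or 'cnn'
    obs_shape=(4,), n_actions=2,
    optimizer_agent='Adam',             # or 'RMSProp'
    lr_agent=1e-4, lambda_agent=0.0,
    replay_buffer_size=100000,
    epsilon_annealing_scheme='linear',  # or 'exp'
    exploration_fraction=0.1, n_agent_steps=1000000,
    epsilon_start=1.0, epsilon_stop=0.05, epsilon_decay=0.999)
# q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule = init_dqn(example_args)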
def train_DQN(env: WrapIt, Q: DQN, Q_target: DQN, optimizer: namedtuple,
              replay_buffer: ReplayBuffer, exploration: Schedule):
    """
    @parameters
        Q: online Q-network being trained
        Q_target: target Q-network used for bootstrapping
        optimizer: namedtuple of (constructor, kwargs) used to build a torch.optim.Optimizer
        replay_buffer: stores the frames
    @return None
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    optimizer = optimizer.constructor(Q.parameters(), **optimizer.kwargs)
    num_actions = env.action_space.n
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    LOG_EVERY_N_STEPS = 10000
    last_obs = env.reset(passit=True)
    # Q.getSummary()
    out_count = 0

    bar = tqdm(range(ARGS.timesteps))
    for t in bar:
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        if t > ARGS.startepoch:
            value = select_epsilon_greedy_action(Q, recent_observations,
                                                 exploration, t, num_actions)
            action = value[0, 0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs
        # bar.set_description(f"{obs.shape} {obs.dtype}")

        if (t > ARGS.startepoch and t % ARGS.dqn_freq == 0
                and replay_buffer.can_sample(ARGS.batchsize)):
            bar.set_description("backward")
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             done_mask) = replay_buffer.sample(ARGS.batchsize)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TENSOR(obs_batch, act_batch, rew_batch,
                                     next_obs_batch, 1 - done_mask)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TO(obs_batch, act_batch, rew_batch,
                                 next_obs_batch, not_done_mask)

            values = Q(obs_batch)
            current_Q_values = values.gather(
                1, act_batch.unsqueeze(1).long()).squeeze()
            # Compute next Q value based on which action gives max Q values.
            # Detach from the current graph since we don't want gradients
            # for the next Q to be propagated.
            next_max_q = Q_target(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            Q_target_values = rew_batch + (ARGS.gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = Q_target_values - current_Q_values
            # Clip the Bellman error to [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 is the correct gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # Run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            if num_param_updates % ARGS.dqn_updatefreq == 0:
                bar.set_description("update")
                Q_target.load_state_dict(Q.state_dict())
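# The TENSOR and TO helpers used above are not shown in this snippet. A minimal
# sketch of what they could look like, assuming TENSOR converts numpy batches to
# torch tensors and TO moves them onto the training device (both assumptions,
# not the original helpers):
def TENSOR(*arrays):
    # Convert each numpy array in the batch to a float32 torch tensor
    return tuple(torch.as_tensor(a, dtype=torch.float32) for a in arrays)


def TO(*tensors, device='cuda' if torch.cuda.is_available() else 'cpu'):
    # Move every tensor in the batch onto the chosen device
    return tuple(t.to(device) for t in tensors)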
                    if receive.edgeTotalConnectInfo[edge] < give.edgeTotalConnectInfo[edge]:
                        receive.edgeTotalConnectInfo[edge] = give.edgeTotalConnectInfo[edge]
                        j.add(infomation)
                if feat == 2:
                    if receive.edgeCountInfo[edge] < give.edgeCountInfo[edge]:
                        receive.edgeCountInfo[edge] = give.edgeCountInfo[edge]
                        j.add(infomation)
            for i in range(num_agent):
                if i != give.num and i != receive.num:
                    receive.featureUpdate[i] = receive.featureUpdate[i].union(j)
            give.featureUpdate[receive.num].clear()
        elif give.num == receive.num:
            give.featureUpdate[receive.num].clear()


model = DQN(nfeat=num_feature)
# model.load_state_dict(torch.load(lists))  # retrain
model_target = DQN(nfeat=num_feature)
model_target.load_state_dict(model.state_dict())
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0002)

# replay = namedtuple('replay', ('nextnode', 'state', 'action', 'reward', 'next_state'))


class Replay_buffer():
    def __init__(self, buffer_size):
        self.buffer_size = buffer_size
        self.buffer = np.zeros([buffer_size], dtype=replay)
        self.index = 0
        self.cur_size = 0

    def push(self, experience):
        self.buffer[self.index] = experience
        self.index = (self.index + 1) % self.buffer_size
        if self.cur_size < self.buffer_size:
            self.cur_size += 1

    def sample(self, batch_size):
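        # (The original snippet is cut off here; the body below is a minimal
        # uniform-sampling sketch over the filled portion of the buffer, not the
        # original implementation.)
        idx = np.random.randint(0, self.cur_size, size=batch_size)
        return [self.buffer[i] for i in idx]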
class QAgent:
    def __init__(self, epsilon_start, epsilon_end, epsilon_anneal, nb_actions,
                 learning_rate, gamma, batch_size, replay_memory_size,
                 hidden_size, model_input_size, use_PER, use_ICM):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_anneal_over_steps = epsilon_anneal
        self.num_actions = nb_actions
        self.gamma = gamma
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.step_no = 0

        self.policy = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target = DQN(hidden_size=hidden_size,
                          inputs=model_input_size,
                          outputs=nb_actions).to(self.device)
        self.target.load_state_dict(self.policy.state_dict())
        self.target.eval()
        self.hidden_size = hidden_size

        self.optimizer = torch.optim.AdamW(self.policy.parameters(),
                                           lr=self.learning_rate)
        self.use_PER = use_PER
        if use_PER:
            self.replay = Prioritized_Replay_Memory(replay_memory_size)
        else:
            self.replay = Replay_Memory(replay_memory_size)
        self.loss_function = torch.nn.MSELoss()
        self.use_ICM = use_ICM
        if use_ICM:
            self.icm = ICM(model_input_size, nb_actions)

    # Get the current epsilon value according to the start/end and annealing values
    def get_epsilon(self):
        eps = self.epsilon_end
        if self.step_no < self.epsilon_anneal_over_steps:
            eps = self.epsilon_start - self.step_no * \
                ((self.epsilon_start - self.epsilon_end) / self.epsilon_anneal_over_steps)
        return eps

    # Select an action with epsilon-greedy exploration
    def select_action(self, state):
        self.step_no += 1
        if np.random.uniform() > self.get_epsilon():
            with torch.no_grad():
                return torch.argmax(self.policy(state)).view(1)
        else:
            return torch.tensor([random.randrange(self.num_actions)],
                                device=self.device, dtype=torch.long)

    # Update the model according to one-step TD targets
    def update_model(self):
        if self.use_PER:
            batch_index, batch, ImportanceSamplingWeights = self.replay.sample(
                self.batch_size)
        else:
            batch = self.replay.sample(self.batch_size)
        batch_tuple = Transition(*zip(*batch))
        state = torch.stack(batch_tuple.state)
        action = torch.stack(batch_tuple.action)
        reward = torch.stack(batch_tuple.reward)
        next_state = torch.stack(batch_tuple.next_state)
        done = torch.stack(batch_tuple.done)

        self.optimizer.zero_grad()

        if self.use_ICM:
            self.icm.optimizer.zero_grad()
            forward_loss = self.icm.get_forward_loss(state, action, next_state)
            inverse_loss = self.icm.get_inverse_loss(state, action, next_state)
            icm_loss = (1 - self.icm.beta) * inverse_loss.mean() \
                + self.icm.beta * forward_loss.mean()

        td_estimates = self.policy(state).gather(1, action).squeeze()
        td_targets = reward + (1 - done.float()) * self.gamma * \
            self.target(next_state).max(1)[0].detach_()

        if self.use_PER:
            loss = (torch.tensor(ImportanceSamplingWeights, device=self.device)
                    * self.loss_function(td_estimates, td_targets)
                    ).sum() * self.loss_function(td_estimates, td_targets)
            errors = td_estimates - td_targets
            self.replay.batch_update(batch_index, errors.data.numpy())
        else:
            loss = self.loss_function(td_estimates, td_targets)

        if self.use_ICM:
            loss = self.icm.lambda_weight * loss + icm_loss

        loss.backward()
        for param in self.policy.parameters():
            param.grad.data.clamp_(-1, 1)
        if self.use_ICM:
            self.icm.optimizer.step()
        self.optimizer.step()
        return loss.item()

    # Set target net parameters to policy net parameters
    def update_target(self):
        self.target.load_state_dict(self.policy.state_dict())

    # Save model
    def save(self, path, name):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, os.path.join(path, name + ".pt"))
        torch.save(self.policy.state_dict(), filename)

    # Load a model
    def load(self, path):
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, path)
        self.policy.load_state_dict(torch.load(filename))

    # Store experience in replay memory
    def cache(self, state, action, reward, next_state, done):
        self.replay.push(state, action, reward, next_state, done)
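# Usage sketch (illustrative, not part of the original file): a minimal episode
# loop wiring QAgent's public methods together. `env` is assumed to be Gym-style
# and to emit observations that are already torch tensors on agent.device; the
# warmup and target-update intervals are placeholders.
def run_episode(agent, env, warmup=1000, update_target_every=1000):
    state, done, total_reward = env.reset(), False, 0.0
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action.item())
        agent.cache(state, action,
                    torch.tensor(reward, device=agent.device),
                    next_state,
                    torch.tensor(done, device=agent.device))
        if agent.step_no > warmup:  # wait until the replay memory holds enough samples
            agent.update_model()
        if agent.step_no % update_target_every == 0:
            agent.update_target()
        state, total_reward = next_state, total_reward + reward
    return total_reward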
class DQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=0.9999, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn', device='cuda:0'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0
        self.device = device

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Create policy and target DQN models
        self.policy = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'policy',
                          chkpt_dir=self.chkpt_dir)
        self.target = DQN(self.n_actions, input_dims=self.input_dims,
                          name=self.env_name + '_' + 'target',
                          chkpt_dir=self.chkpt_dir)

        # Put on the correct device (GPU or CPU)
        self.policy.to(device)
        self.target.to(device)

        # Optimizer
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        # Loss
        self.loss = nn.MSELoss()

    def choose_action(self, observation):
        # Choose an action (epsilon-greedy)
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation], dtype=torch.float).to(self.device)
            actions = self.policy.forward(state)
            action = torch.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.device)
        rewards = torch.tensor(reward).to(self.device)
        dones = torch.tensor(done).to(self.device)
        actions = torch.tensor(action).to(self.device)
        states_ = torch.tensor(new_state).to(self.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.target.load_state_dict(self.policy.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon *= self.eps_dec

    def save_models(self):
        self.policy.save_checkpoint()

    def load_models(self):
        self.policy.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        q_pred = self.policy.forward(states)[indices, actions]
        q_next = self.target.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.loss(q_target, q_pred).to(self.device)
        loss.backward()
        self.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
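# Usage sketch (illustrative, not part of the original file): a bare training loop
# over a Gym-style env using DQNAgent's public interface. Hyperparameter values
# and the environment name are placeholders.
def train(env, n_episodes=500):
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                     n_actions=env.action_space.n,
                     input_dims=env.observation_space.shape,
                     mem_size=50000, batch_size=32,
                     algo='DQNAgent', env_name='PongNoFrameskip-v4')
    for episode in range(n_episodes):
        observation, done, score = env.reset(), False, 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()
            observation, score = observation_, score + reward
    return agent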
                    if receive.edgeCountInfo[edge] < give.edgeCountInfo[edge]:
                        receive.edgeCountInfo[edge] = give.edgeCountInfo[edge]
                        j.add(infomation)
            for i in range(num_agent):
                if i != give.num and i != receive.num:
                    receive.featureUpdate[i] = receive.featureUpdate[i].union(j)
            give.featureUpdate[receive.num].clear()
        elif give.num == receive.num:
            give.featureUpdate[receive.num].clear()


model = DQN(nfeat=num_feature)
model.load_state_dict(torch.load(lists))


def pick_edge(ag):
    X = feature_matrix(ag)
    output = model(torch.from_numpy(X))
    outputnum = -1
    outputmax = -math.inf
    for i in range(num_node):
        if output[i] >= outputmax and i in node_ALL[ag.togonode].connected_node:
            outputmax = output[i]
            outputnum = i
    return outputnum
def main(test=False, checkpoint=None, device='cuda'):
    if not test:
        wandb.init(project='dqn-breakout', name='test3')

    memory_size = 100000
    min_rb_size = 20000
    sample_size = 100
    lr = 0.0001

    eps_min = 0.05
    eps_decay = 0.99995

    env_steps_before_train = 10
    tgt_model_update = 5000

    env = gym.make('Breakout-v0')
    env = FrameStackingAndResizingEnv(env, 84, 84, 4)
    last_observation = env.reset()

    model = DQN(env.observation_space.shape, env.action_space.n, lr=lr).to(device)
    if checkpoint is not None:
        model.load_state_dict(torch.load(checkpoint))
    target = DQN(env.observation_space.shape, env.action_space.n).to(device)
    update_target_model(model, target)

    replay = ReplayBuffer(memory_size)
    steps_since_train = 0
    epochs_since_tgt = 0

    step_num = -1 * min_rb_size
    episode_rewards = []
    rolling_reward = 0

    tq = tqdm()
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)

        eps = max(eps_min, eps_decay ** (step_num))
        if test:
            eps = 0

        if random() < eps:
            action = env.action_space.sample()
        else:
            x = torch.Tensor(last_observation).unsqueeze(0).to(device)
            action = model(x).max(-1)[-1].item()

        observation, reward, done, info = env.step(action)
        rolling_reward += reward
        reward = reward * 0.1

        replay.insert(Sarsd(last_observation, action, reward, observation, done))
        last_observation = observation

        if done:
            episode_rewards.append(rolling_reward)
            if test:
                print(rolling_reward)
            rolling_reward = 0
            observation = env.reset()

        steps_since_train += 1
        step_num += 1

        if (not test) and (replay.idx > min_rb_size) and (steps_since_train > env_steps_before_train):
            loss = train_step(model, replay.sample(sample_size), target,
                              env.action_space.n, device)
            wandb.log({
                "loss": loss.detach().cpu().item(),
                "eps": eps,
                "avg_reward": np.mean(episode_rewards)
            })
            episode_rewards = []
            epochs_since_tgt += 1
            if epochs_since_tgt > tgt_model_update:
                print('updating target model')
                update_target_model(model, target)
                epochs_since_tgt = 0
                torch.save(target.state_dict(), f'target.model')
            steps_since_train = 0

    env.close()
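# update_target_model is referenced above but not defined in this snippet. Given
# how the other agents in this collection sync their target networks, it is
# presumably a hard parameter copy; a minimal sketch (an assumption, not the
# original helper):
def update_target_model(model, target):
    # Copy the online network's weights into the target network
    target.load_state_dict(model.state_dict())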
class Agent:
    def __init__(self):
        self.controller, self.target = DQN(), DQN()  # For RL
        self.vision = VAE()
        if USE_CUDA:
            self.controller.cuda()
            self.target.cuda()
            self.vision.cuda()

        # Init weights based on init function
        self.controller.apply(init_weights)
        self.vision.apply(init_weights)
        # Load model params into target
        self.target.load_state_dict(self.controller.state_dict())

        self.action_number = 0  # actions taken (to determine whether or not to update)

        # NOTE: DQN exp buffer should use embeddings generated by vision module
        # The vision module (aka the VAE) has memory consisting of game states
        self.exp_buffer = []  # exp buffer
        self.exp_number = 0   # size of exp buffer so far

        self.opt = torch.optim.Adam(self.controller.parameters(), lr=DQN_LEARNING_RATE)
        self.loss = nn.SmoothL1Loss()

    # Make an action given a state
    def act(self, state, explore=True):
        self.action_number += 1
        # Update target
        if self.action_number % TARGET_INTERVAL == 0:
            self.target.load_state_dict(self.controller.state_dict())

        if explore and np.random.rand() <= EPSILON:
            # Act randomly
            a = np.random.randint(NUM_ACTIONS)
            return a

        # Send state to model
        a_vec = self.controller(self.vision.encode(state))
        a = int(torch.argmax(torch.squeeze(a_vec)))
        return a

    def load_params(self):
        # Looks in current directory for params for model and for VAE
        if LOAD_CHECKPOINT_VAE:
            try:
                self.vision.load_state_dict(torch.load("VAEparams.pt"))
                print("Loaded checkpoint for VAE")
            except:
                print("Could not load VAE checkpoint")
        if LOAD_CHECKPOINT_DQN:
            try:
                self.controller.load_state_dict(torch.load("DQNparams.pt"))
                self.target.load_state_dict(torch.load("DQNparams.pt"))
                print("Loaded checkpoint for DQN")
            except:
                print("Could not load DQN checkpoint")

    def save_params(self):
        torch.save(self.controller.state_dict(), "DQNparams.pt")
        torch.save(self.vision.state_dict(), "VAEparams.pt")

    # Clear the buffer
    def clear_exp_buffer(self):
        self.exp_buffer = []
        self.exp_number = 0
        self.vision.memory = []
        self.vision.memory_num = 0

    # Add experience to exp buffer
    def add_exp(self, exp):
        self.vision.remember(exp[0])
        if self.exp_number >= EXP_BUFFER_MAX:
            del self.exp_buffer[0]
        else:
            self.exp_number += 1
        exp[0] = self.vision.encode(exp[0])
        exp[3] = self.vision.encode(exp[3])
        self.exp_buffer.append(exp)

    # Replay gets a batch and trains on it
    # Returns [vision loss, controller loss]
    def replay(self, batch_size):
        v_loss, q_loss = 0, 0  # Init to 0 in case we need to return without any training

        # Train vision component first
        if self.action_number % VAE_UPDATE_INTERVAL == 0:
            v_loss = self.vision.replay()

        # If experience buffer isn't the right size yet, don't do anything
        if self.exp_number < EXP_BUFFER_MIN or self.action_number % TRAINING_INTERVAL != 0:
            return [v_loss, q_loss]

        # Get batch from experience buffer
        batch = random.sample(self.exp_buffer, batch_size)
        s, a, r, s_new, _ = zip(*batch)
        s_new = s_new[:-1]  # Remove last

        # First turn batch into something we can run through the model
        s = torch.cat(s)
        a = torch.LongTensor(a).unsqueeze(1)
        r = torch.FloatTensor(r)
        s_new = torch.cat(s_new)
        if USE_CUDA:
            a = a.cuda()
            r = r.cuda()

        # Get q vals for s (what the model outputted) from a
        # .gather gets us the q value for the specific action a
        pred_q_vals = self.controller(s).gather(1, a).squeeze()

        # Having chosen a in s,
        # what is the highest possible reward we can get from s_new?
        # We add q of performing a in s, then add the best q from the next state.
        # Cat a 0 to the end for the terminal state.
        s_new_q_vals = self.target(s_new).max(1)[0]
        zero = torch.zeros(1)
        if USE_CUDA:
            zero = zero.cuda()
        s_new_q_vals = torch.cat((s_new_q_vals, zero))
        exp_q_vals = r + s_new_q_vals * GAMMA

        myloss = self.loss(pred_q_vals, exp_q_vals)
        self.opt.zero_grad()
        myloss.backward()
        if WEIGHT_CLIPPING:
            for param in self.controller.parameters():
                param.grad.data.clamp_(-1, 1)  # Clamping gradients avoids exploding gradients
        self.opt.step()

        global EPSILON
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        return [v_loss, myloss.item()]
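# Usage sketch (illustrative, not from the original code): add_exp mutates the
# experience in place, so it expects a mutable list [state, action, reward,
# next_state, done] of raw frames; replay() then trains on the VAE-encoded copies.
# `env`, n_steps and batch_size are placeholders.
def run(env, n_steps=1000, batch_size=32):
    agent = Agent()
    state = env.reset()
    for _ in range(n_steps):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.add_exp([state, action, reward, next_state, done])
        v_loss, q_loss = agent.replay(batch_size)
        state = env.reset() if done else next_state
    agent.save_params()
    return agent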
args.eps_start = 0.0
args.eps_end = 0.0
args.eps_steps = 1
policy = EpsGreedyPolicy(args.eps_start, args.eps_end, args.eps_steps)
opt_step = 0

# pre-training
if not args.no_train:
    print('Pre-training')
    for i in range(1000):
        opt_step += 1
        optimize_dqfd(args.bsz, 1.0, opt_step)
        if i % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
    print('Pre-training done')
else:
    args.demo_prop = 0

env = MyEnv()
env.reset()

# training loop
ep_counter = count(1) if args.num_eps < 0 else range(args.num_eps)
for i_episode in ep_counter:
    state = env.reset()
    total_reward = 0
    transitions = []
    q_vals = policy_net(state)
    for step_n in count():
def main(test=False, checkpoint=None, device='cuda', project_name='dqn', run_name='example'):
    if not test:
        wandb.init(project=project_name, name=run_name)

    ## HYPERPARAMETERS
    memory_size = 500000
    min_rb_size = 50000
    sample_size = 64
    lr = 0.0001

    boltzmann_exploration = False
    eps_min = 0.05
    eps_decay = 0.999995

    train_interval = 4
    update_interval = 10000
    test_interval = 5000

    episode_reward = 0
    episode_rewards = []

    # additional hparams
    screen_flicker_probability = 0.5
    living_reward = -0.01
    same_frame_ctr = 0
    same_frame_limit = 200

    # replay buffer
    replay = ReplayBuffer(memory_size)
    step_num = -1 * min_rb_size

    # environment creation
    env = gym.make('BreakoutDeterministic-v4')
    env = BreakoutFrameStackingEnv(env, 84, 84, 4)
    test_env = gym.make('BreakoutDeterministic-v4')
    test_env = BreakoutFrameStackingEnv(test_env, 84, 84, 4)
    last_observation = env.reset()

    # model creation
    model = DQN(env.observation_space.shape, env.action_space.n, lr=lr).to(device)
    if checkpoint is not None:
        model.load_state_dict(torch.load(checkpoint))
    target = DQN(env.observation_space.shape, env.action_space.n).to(device)
    update_target_model(model, target)

    # training loop
    tq = tqdm()
    while True:
        if test:
            env.render()
            time.sleep(0.05)
        tq.update(1)

        eps = max(eps_min, eps_decay ** (step_num))
        if test:
            eps = 0

        if boltzmann_exploration:
            x = torch.Tensor(last_observation).unsqueeze(0).to(device)
            logits = model(x)
            action = torch.distributions.Categorical(logits=logits[0]).sample().item()
        else:
            # epsilon-greedy
            if random() < eps:
                action = env.action_space.sample()
            else:
                x = torch.Tensor(last_observation).unsqueeze(0).to(device)
                qvals = model(x)
                action = qvals.max(-1)[-1].item()

        # screen flickering
        # if random() < screen_flicker_probability:
        #     last_observation = np.zeros_like(last_observation)

        # observe and obtain reward
        observation, reward, done, info = env.step(action)
        episode_reward += reward

        # add to replay buffer
        replay.insert(Sarsd(last_observation, action, reward, observation, done))
        last_observation = observation

        # episode end logic
        if done:
            episode_rewards.append(episode_reward)
            if len(episode_rewards) > 100:
                del episode_rewards[0]
            wandb.log({
                "reward_ep": episode_reward,
                "avg_reward_100ep": np.mean(episode_rewards)
            })
            episode_reward = 0
            last_observation = env.reset()

        step_num += 1

        # testing, model updating and checkpointing
        if (not test) and (replay.idx > min_rb_size):
            if step_num % train_interval == 0:
                loss = train_step(model, replay.sample(sample_size), target,
                                  env.action_space.n, device)
                wandb.log({
                    "loss": loss.detach().cpu().item(),
                    "step": step_num
                })
                if not boltzmann_exploration:
                    wandb.log({"eps": eps})

            if step_num % update_interval == 0:
                print('updating target model')
                update_target_model(model, target)
                torch.save(target.state_dict(), f'target.model')
                model_artifact = wandb.Artifact("model_checkpoint", type="raw_data")
                model_artifact.add_file('target.model')
                wandb.log_artifact(model_artifact)

            if step_num % test_interval == 0:
                print('running test')
                avg_reward, best_reward, frames = policy_evaluation(
                    model, test_env, device)  # model or target?
                wandb.log({
                    'test_avg_reward': avg_reward,
                    'test_best_reward': best_reward,
                    'test_best_video': wandb.Video(frames.transpose(0, 3, 1, 2),
                                                   str(best_reward), fps=24)
                })

    env.close()
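# train_step is called above but defined elsewhere. A minimal sketch of a standard
# one-step TD update consistent with how it is called here (model, a batch of
# Sarsd transitions, target net, num_actions, device). The Sarsd field names and
# the `model.opt` optimizer attribute are assumptions, not the original code:
def train_step(model, transitions, target, num_actions, device, gamma=0.99):
    states = torch.stack([torch.Tensor(t.state) for t in transitions]).to(device)
    actions = torch.tensor([t.action for t in transitions], device=device)
    rewards = torch.tensor([t.reward for t in transitions], dtype=torch.float32, device=device)
    next_states = torch.stack([torch.Tensor(t.next_state) for t in transitions]).to(device)
    mask = torch.tensor([0.0 if t.done else 1.0 for t in transitions], device=device)

    with torch.no_grad():
        next_q = target(next_states).max(-1)[0]      # bootstrapped value of the next state
    td_target = rewards + gamma * mask * next_q
    q_taken = model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    loss = ((q_taken - td_target) ** 2).mean()       # simple MSE TD loss
    model.opt.zero_grad()
    loss.backward()
    model.opt.step()
    return loss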
def val_dist():
    file = open(thefile, 'r', encoding='UTF-8')
    line = file.readlines()
    num_node = int(line[0])
    num_edge = int(line[1])
    num_agent = int(line[num_node + num_edge + 2])
    constraint = int(line[num_node + num_edge + num_agent + 3])
    maxspeed = 0
    Cost = 0

    # lists = "Model\_3f_dist_no"
    # lists = "Model\_3f_dist_1"
    lists = "Model\_3f_dist_2"

    class Node:
        def __init__(self, pos, number):
            self.pos = pos
            self.number = number
            self.connected_node = []
            self.in_commu_range = []  # nodes within communication range (constraint)
            self.all_ag_here = []  # agents currently on this node

    class Edge:
        def __init__(self, distance, number):
            self.ox = 'x'
            self.distance = distance
            self.number = number
            self.count = 0

    class Agent:
        def __init__(self, cur, speed, number):
            self.currnode_ori = cur
            self.currnode = cur
            self.togonode = cur
            self.lastedge = 0
            self.togoedge = 0
            self.curedge_length = 0
            self.step = 0
            self.speed = speed
            self.cost = 0
            self.num = number
            self.historyaction = []
            self.reward = 0
            self.start = cur
            self.edgeLengthInfo = []
            self.alreadyVisitInfo = []
            self.edgeTotalConnectMap = [[0] * num_edge for i in range(num_edge)]
            self.edgeTotalConnectInfo = []
            self.totalAgentMap = [[0] * 2 for i in range(num_edge)]
            self.totalAgentInfo = []
            self.edgeCountInfo = []
            for i in range(num_edge):
                self.edgeLengthInfo.append(0)
                self.alreadyVisitInfo.append(0)
                self.edgeTotalConnectInfo.append(0)
                self.totalAgentInfo.append(0)
                self.edgeCountInfo.append(0)
            self.featureUpdate = []
            for i in range(num_agent):
                j = set()
                self.featureUpdate.append(j)

    node_ALL = []
    edge_ALL = {}
    agent_ALL = []

    for i in range(num_node):
        k = i + 2
        line[k] = line[k].split()
        for j in range(len(line[k])):
            line[k][j] = int(line[k][j])
        l = Node((line[k][1], line[k][2]), line[k][0])
        node_ALL.append(l)

    for i in range(num_edge):
        k = num_node + i + 2
        line[k] = line[k].split()
        for j in range(len(line[k])):
            line[k][j] = int(line[k][j])
        l = Edge(line[k][2], i)
        line[k].pop()
        edge_ALL[tuple(line[k])] = l
        start = line[k][0]
        end = line[k][1]
        node_ALL[start].connected_node.append(end)
        node_ALL[end].connected_node.append(start)

    for i in range(num_agent):
        k = num_node + num_edge + i + 3
        line[k] = line[k].split()
        for j in range(len(line[k])):
            line[k][j] = int(line[k][j])
        l = Agent(int(line[k][1]), int(line[k][2]), int(line[k][0]))
        agent_ALL.append(l)
        if (maxspeed < int(line[k][2])):
            maxspeed = int(line[k][2])
        node_ALL[l.currnode].all_ag_here.append(i)

    # compute which nodes are within communication range (constraint)
    def cal_dis(a, b):
        return np.sqrt(
            np.square(abs(a.pos[0] - b.pos[0])) +
            np.square(abs(a.pos[1] - b.pos[1])))

    for i in range(num_node):
        for j in range(num_node):
            if (cal_dis(node_ALL[i], node_ALL[j]) <= constraint):
                node_ALL[i].in_commu_range.append(j)

    def find_edge(a, b):
        if tuple([a, b]) in edge_ALL:
            return tuple([a, b])
        else:
            return tuple([b, a])

    # feature matrix (todo)
    num_feature = 3

    def feature_matrix(ag):
        X = np.zeros((num_node, num_feature))
        for k in node_ALL[ag.currnode].connected_node:
            ed = edge_ALL[find_edge(ag.currnode, k)].number
            # distance
            if ag.edgeLengthInfo[ed] != 0:
                X[k][0] = ag.edgeLengthInfo[ed]
            # how many edges this edge connects to
            X[k][1] = ag.edgeTotalConnectInfo[ed]
            # how many times this edge has been traversed
            X[k][2] = ag.edgeCountInfo[ed]
        X = np.around((X), decimals=3)
        return X

    def update_info():
        for u in range(num_agent):
            for give in agent_ALL:
                for receive in agent_ALL:
                    if (receive.currnode in node_ALL[give.currnode].in_commu_range
                            and give.num != receive.num):
                        j = set()
                        for infomation in set(give.featureUpdate[receive.num]):
                            feat, edge = infomation
                            if feat == 0:
                                if receive.edgeLengthInfo[edge] == 0:
                                    receive.edgeLengthInfo[edge] = give.edgeLengthInfo[edge]
                                    j.add(infomation)
                            if feat == 1:
                                if receive.edgeTotalConnectInfo[edge] < give.edgeTotalConnectInfo[edge]:
                                    receive.edgeTotalConnectInfo[edge] = give.edgeTotalConnectInfo[edge]
                                    j.add(infomation)
                            if feat == 2:
                                if receive.edgeCountInfo[edge] < give.edgeCountInfo[edge]:
                                    receive.edgeCountInfo[edge] = give.edgeCountInfo[edge]
                                    j.add(infomation)
                        for i in range(num_agent):
                            if i != give.num and i != receive.num:
                                receive.featureUpdate[i] = receive.featureUpdate[i].union(j)
                        give.featureUpdate[receive.num].clear()
                    elif give.num == receive.num:
                        give.featureUpdate[receive.num].clear()

    model = DQN(nfeat=num_feature)
    model.load_state_dict(torch.load(lists))

    def pick_edge(ag):
        X = feature_matrix(ag)
        output = model(torch.from_numpy(X))
        outputnum = -1
        outputmax = -math.inf
        for i in range(num_node):
            if output[i] >= outputmax and i in node_ALL[ag.togonode].connected_node:
                outputmax = output[i]
                outputnum = i
        return outputnum

    def walking(ag):
        if ag.currnode_ori != ag.togonode:
            edge_ALL[find_edge(ag.currnode_ori, ag.togonode)].ox = 'o'
            ag.edgeLengthInfo[edge_ALL[ag.togoedge].number] = ag.curedge_length
            ag.alreadyVisitInfo[edge_ALL[ag.togoedge].number] = 1
            for i in range(num_agent):
                ag.featureUpdate[i].add(tuple([0, edge_ALL[ag.togoedge].number]))
        ag.currnode = ag.togonode
        ag.currnode_ori = ag.togonode
        ag.lastedge = ag.togoedge
        ag.historyaction.append(ag.togonode)
        ag.step = ag.step - ag.curedge_length
        ag.togonode = pick_edge(ag)
        togo_edge = find_edge(ag.currnode, ag.togonode)
        ag.curedge_length = edge_ALL[togo_edge].distance
        ag.togoedge = togo_edge
        if ag.lastedge != ag.togoedge and ag.lastedge != 0:
            head = edge_ALL[ag.lastedge].number
            tail = edge_ALL[ag.togoedge].number
            ag.edgeTotalConnectMap[head][tail] = 1
            ag.edgeTotalConnectMap[tail][head] = 1
            ag.edgeTotalConnectInfo[head] = sum(ag.edgeTotalConnectMap[head])
            ag.edgeTotalConnectInfo[tail] = sum(ag.edgeTotalConnectMap[tail])
            for i in range(num_agent):
                ag.featureUpdate[i].add(tuple([1, head]))
                ag.featureUpdate[i].add(tuple([1, tail]))
        edge_ALL[ag.togoedge].count += 1
        ag.edgeCountInfo[edge_ALL[ag.togoedge].number] = edge_ALL[ag.togoedge].count
        for i in range(num_agent):
            ag.featureUpdate[i].add(tuple([2, edge_ALL[ag.togoedge].number]))

    k = 100000
    while not all(edge_ALL[r].ox == 'o' for r in edge_ALL):
        for ag in agent_ALL:
            ag.step += ag.speed
            ag.cost += ag.speed
            while ag.curedge_length <= ag.step:
                update_info()
                node_ALL[ag.currnode].all_ag_here.remove(ag.num)
                walking(ag)
                node_ALL[ag.currnode].all_ag_here.append(ag.num)
            if ag.step > ag.curedge_length / 2:
                node_ALL[ag.currnode].all_ag_here.remove(ag.num)
                ag.currnode = ag.togonode
                node_ALL[ag.currnode].all_ag_here.append(ag.num)
        update_info()
        Cost += maxspeed
        if Cost > k:
            print(Cost)
            k += 100000

    # Write all actions to file
    fileforHistoryaction = "Animation/RL_dist" + str(num_node) + ".txt"
    f = open(fileforHistoryaction, "w")
    print(num_node, file=f)
    for i in agent_ALL:
        print(i.historyaction, file=f)
    print("Model_Dist = ", Cost)
    thecost[4] += Cost
class FixedDQNAgent(DQNAgent):
    """
    DQN Agent with a target network to compute Q-targets. Extends DQNAgent.
    """

    def __init__(self, input_dim, output_dim, lr, gamma, max_memory_size,
                 batch_size, eps_start, eps_end, eps_decay, device,
                 target_update=100, linear1_units=64, linear2_units=64,
                 decay_type="linear"):
        super().__init__(input_dim, output_dim, lr, gamma, max_memory_size,
                         batch_size, eps_start, eps_end, eps_decay, device,
                         linear1_units, linear2_units, decay_type)
        self.model_name = "FixedDQN"
        self.target_update_freq = target_update

        # networks
        self.output_dim = output_dim
        self.target_net = DQN(input_dim, output_dim, linear1_units,
                              linear2_units).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.updated = 0

    def learn(self):
        """
        Update the weights of the network, using target_net to compute Q-targets.
        Every self.target_update_freq updates, clone the policy_net.
        :return: the loss
        """
        states, next_states, actions, rewards, dones = self.memory.sample(
            self.batch_size)

        curr_q_vals = self.policy_net(states).gather(1, actions)
        next_q_vals = self.target_net(next_states).max(1, keepdim=True)[0].detach()
        target = (rewards + self.gamma * next_q_vals * (1 - dones)).to(self.device)

        loss = F.smooth_l1_loss(curr_q_vals, target)

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        self.updated += 1
        if self.updated % self.target_update_freq == 0:
            self.target_hard_update()

        return loss.item()

    def target_hard_update(self):
        """ Clone the policy net weights into the target net """
        self.target_net.load_state_dict(self.policy_net.state_dict())
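# Usage sketch (illustrative): construction mirrors the __init__ signature above;
# the hyperparameter values are placeholders, and action selection / memory
# filling come from the parent DQNAgent (not shown here).
def make_fixed_agent(device='cpu'):
    return FixedDQNAgent(input_dim=4, output_dim=2, lr=1e-3, gamma=0.99,
                         max_memory_size=10000, batch_size=64,
                         eps_start=1.0, eps_end=0.05, eps_decay=0.995,
                         device=device, target_update=100)
# once agent.memory holds at least batch_size transitions, call agent.learn()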
# Compute current Q value; q_func takes only the state and outputs a value for
# every state-action pair. We select Q based on the action taken.
current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).view(-1)
# Compute next Q value based on which action gives max Q values.
# Detach from the current graph since we don't want gradients for the next Q to be propagated.
next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
next_Q_values = not_done_mask * next_max_q
# Compute the target of the current Q values
target_Q_values = rew_batch + (GAMMA * next_Q_values)
# Compute Bellman error
bellman_error = target_Q_values - current_Q_values
# Clip the Bellman error to [-1, 1]
clipped_bellman_error = bellman_error.clamp(-1, 1)
# Note: clipped_bellman_error * -1 is the correct gradient
d_error = clipped_bellman_error * -1.0
# Clear previous gradients before backward pass
optimizer.zero_grad()
# Run backward pass
current_Q_values.backward(d_error.data)
# Perform the update
optimizer.step()
num_param_updates += 1

# Periodically copy the Q network's weights into the target Q network
if num_param_updates % TARGERT_UPDATE_FREQ == 0:
    target_Q.load_state_dict(Q.state_dict())

print(np.mean(episodes_rewards))
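# Side note (not part of the original snippet): passing d_error to backward()
# supplies dLoss/dQ directly. Clipping the Bellman error to [-1, 1] before using
# it as the gradient matches, up to the batch reduction (sum vs. mean), the
# gradient of the Huber / smooth-L1 loss, so an equivalent formulation would be:
#     loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
#     optimizer.zero_grad(); loss.backward(); optimizer.step()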