def main():
    policy_net = DQN(U_num, num_actions).to(device)  # Initialize the Q network
    policy_net.apply(init_weights)
    if pretrained:
        ckp = torch.load('/data2/jiangjigang/ckp/dqn.pth')
        policy_net.load_state_dict(
            {k.replace('module.', ''): v for k, v in ckp.items()})
    target_net = DQN(U_num, num_actions).to(device)  # Initialize the target Q network
    target_net.load_state_dict(policy_net.state_dict())  # Initialize the target Q network with the Q network's parameters
    target_net.eval()

    optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                        lr=learning_rate)  # Define the Adam optimizer (can be swapped for another)
    buffer = ReplayBuffer(buffer_size)  # Create a replay buffer; it stores experience data that is later sampled at random to update the network parameters (see Buffer.py)
    criterion = torch.nn.MSELoss(reduction='sum')

    # training
    for i_episode in range(num_episodes):
        state0 = [user_loc, user_dis, node_loc, use_buff]  # Get an initial state
        error = 0.0
        all_reward = 0
        for t in count():
            # Select an action
            action = e_greedy_select_action(state0, policy_net)
            a = np.array([action.data.cpu().numpy()])
            # print("action selected by e_greedy is {}".format(action))

            # Use the transition function to get the next state reached by taking the
            # current action in the current state, and whether that next state is terminal
            state1, done, flag = transition_function(state0, action)
            # Use the reward function to get the current reward
            reward, cost_migration = reward_function(state0, action, state1, flag)
            all_reward = all_reward + reward
            # Store the experience in the buffer
            buffer.add(state0, a, reward, state1, done)

            # exit an episode after MAX_T steps
            if t > MAX_T:
                break

            # Only update the network parameters after the first episodes, so that the
            # buffer already holds enough data and training stays stable.
            if i_episode > 1:
                # Sample a batch of BATCH_SIZE transitions from the buffer
                batch = buffer.getBatch(BATCH_SIZE)
                policy_net, target_net, bellman_error = optimize_model(
                    batch, policy_net, target_net, optimizer_policy, criterion)
                error = error + bellman_error.data.cpu().numpy()
            # Move to the next state
            state0 = state1

        ave_error = error / (t * 1.00)
        ave_reward = all_reward / (t * 1.00)
        print(ave_error, ave_reward)
        torch.save(policy_net.state_dict(), '/data2/jiangjigang/ckp/dqn.pth')
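# main() above delegates the actual network update to optimize_model(), which is not shown
# in this snippet. Below is a minimal sketch of such a DQN update under the following
# assumptions: getBatch() returns (state, action, reward, next_state, done) tuples, gamma
# is passed in explicitly, and a hypothetical state_to_tensor() helper flattens the
# list-based state into a float tensor for DQN. Illustrative only, not the original code.
def optimize_model_sketch(batch, policy_net, target_net, optimizer, criterion, gamma=0.9):
    states, actions, rewards, next_states, dones = zip(*batch)
    s = torch.stack([state_to_tensor(x) for x in states]).to(device)        # hypothetical helper
    s1 = torch.stack([state_to_tensor(x) for x in next_states]).to(device)  # hypothetical helper
    a = torch.tensor(np.array(actions), dtype=torch.long, device=device).view(-1, 1)
    r = torch.tensor(rewards, dtype=torch.float, device=device).view(-1, 1)
    d = torch.tensor(dones, dtype=torch.float, device=device).view(-1, 1)

    q_sa = policy_net(s).gather(1, a)                        # Q(s, a) of the taken actions
    with torch.no_grad():
        q_next = target_net(s1).max(dim=1, keepdim=True)[0]  # max_a' Q_target(s', a')
        target = r + gamma * q_next * (1 - d)                 # no bootstrap on terminal states

    loss = criterion(q_sa, target)  # MSELoss(reduction='sum'), as defined in main()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Return the nets and the loss so the caller can accumulate the Bellman error
    return policy_net, target_net, loss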
def train_DQN(env: WrapIt, Q: DQN, Q_target: DQN, optimizer: namedtuple,
              replay_buffer: ReplayBuffer, exploration: Schedule):
    """
    @parameters
        Q:
        Q_target:
        optimizer: torch.nn.optim.Optimizer with parameters
        buffer: store the frame
    @return
        None
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    optimizer = optimizer.constructor(Q.parameters(), **optimizer.kwargs)
    num_actions = env.action_space.n
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    LOG_EVERY_N_STEPS = 10000

    last_obs = env.reset(passit=True)
    # Q.getSummary()
    out_count = 0
    bar = tqdm(range(ARGS.timesteps))
    for t in bar:
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Epsilon-greedy action selection after the warm-up phase, random before it
        if t > ARGS.startepoch:
            value = select_epsilon_greedy_action(Q, recent_observations,
                                                 exploration, t, num_actions)
            action = value[0, 0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs
        # bar.set_description(f"{obs.shape} {obs.dtype}")

        if (t > ARGS.startepoch and t % ARGS.dqn_freq == 0
                and replay_buffer.can_sample(ARGS.batchsize)):
            bar.set_description("backward")
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             done_mask) = replay_buffer.sample(ARGS.batchsize)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TENSOR(obs_batch, act_batch, rew_batch,
                                     next_obs_batch, 1 - done_mask)
            (obs_batch, act_batch, rew_batch, next_obs_batch,
             not_done_mask) = TO(obs_batch, act_batch, rew_batch,
                                 next_obs_batch, not_done_mask)

            values = Q(obs_batch)
            current_Q_values = values.gather(
                1, act_batch.unsqueeze(1).long()).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach the variable from the current graph since we don't want
            # gradients for the next Q to be propagated
            next_max_q = Q_target(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            Q_target_values = rew_batch + (ARGS.gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = Q_target_values - current_Q_values
            # clip the bellman error between [-1, 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be the right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the online weights into the target network
            if num_param_updates % ARGS.dqn_updatefreq == 0:
                bar.set_description("update")
                Q_target.load_state_dict(Q.state_dict())
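# train_DQN() above relies on a select_epsilon_greedy_action() helper that is defined
# elsewhere in the project. A minimal sketch of what it could look like, assuming
# exploration.value(t) returns the current epsilon (as utils.LinearSchedule suggests),
# that observations are uint8 frame stacks, and that world.DEVICE is the target device
# as in the main script; this is an illustration, not the project's own helper.
def select_epsilon_greedy_action_sketch(Q, obs, exploration, t, num_actions):
    eps = exploration.value(t)
    if random.random() > eps:
        # Normalize and add a batch dimension before the forward pass
        obs_t = torch.from_numpy(obs).unsqueeze(0).float().to(world.DEVICE) / 255.0
        with torch.no_grad():
            # Greedy action; shape (1, 1) so the caller can index value[0, 0]
            return Q(obs_t).argmax(dim=1, keepdim=True).cpu()
    return torch.tensor([[random.randrange(num_actions)]])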
import gym

import world
import utils
from Buffer import ReplayBuffer
from models import DQN
from world import Print, ARGS
from wrapper import WrapIt
from procedure import train_DQN

# ------------------------------------------------
env = gym.make('RiverraidNoFrameskip-v4')
env = WrapIt(env)
Print('ENV action', env.unwrapped.get_action_meanings())
Print('ENV observation',
      f"Image: {ARGS.imgDIM} X {ARGS.imgDIM} X {1}")  # we assert to use gray image
# ------------------------------------------------
Optimizer = utils.getOptimizer()
schedule = utils.LinearSchedule(1000000, 0.1)
Game_buffer = ReplayBuffer(ARGS.buffersize, ARGS.framelen)
Q = utils.init_model(env, DQN).train().to(world.DEVICE)
Q_target = utils.init_model(env, DQN).eval().to(world.DEVICE)
# ------------------------------------------------
train_DQN(env,
          Q=Q,
          Q_target=Q_target,
          optimizer=Optimizer,
          replay_buffer=Game_buffer,
          exploration=schedule)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        # self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward
        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
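# A minimal interaction-loop sketch for the DDPG agent above, assuming a task object
# exposing reset() and step(action) that returns (next_state, reward, done), as the
# class's own reset_episode()/step() methods suggest; the episode count is arbitrary
# and the driver below is illustrative, not part of the original snippet.
agent = DDPG(task)
for i_episode in range(1, 501):
    state = agent.reset_episode()          # resets the episode reward and the task
    done = False
    while not done:
        action = agent.act(state)          # actor output plus OU exploration noise
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)  # store, learn, roll state forward
        state = next_state
    print('Episode {:4d} | reward {:8.2f} | best {:8.2f}'.format(
        i_episode, agent.total_reward, agent.best_total_reward))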
class Agent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, chkpt_dir='tmp/dueling_ddqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = Network(self.lr, self.n_actions,
                              input_dims=self.input_dims,
                              name='lunar_lander_dueling_ddqn_q_eval',
                              chkpt_dir=self.chkpt_dir)
        self.q_next = Network(self.lr, self.n_actions,
                              input_dims=self.input_dims,
                              name='lunar_lander_dueling_ddqn_q_next',
                              chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = torch.tensor([observation],
                                 dtype=torch.float).to(self.q_eval.device)
            _, advantage = self.q_eval.forward(state)
            action = torch.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = torch.tensor(state).to(self.q_eval.device)
        rewards = torch.tensor(reward).to(self.q_eval.device)
        dones = torch.tensor(done).to(self.q_eval.device)
        actions = torch.tensor(action).to(self.q_eval.device)
        states_ = torch.tensor(new_state).to(self.q_eval.device)

        indices = np.arange(self.batch_size)

        V_s, A_s = self.q_eval.forward(states)
        V_s_, A_s_ = self.q_next.forward(states_)
        V_s_eval, A_s_eval = self.q_eval.forward(states_)

        q_pred = torch.add(V_s,
                           (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions]
        q_next = torch.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True)))
        q_eval = torch.add(V_s_eval,
                           (A_s_eval - A_s_eval.mean(dim=1, keepdim=True)))

        max_actions = torch.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
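# A minimal training-loop sketch for the dueling double-DQN Agent above, assuming the
# standard gym LunarLander-v2 environment (suggested by the checkpoint names) and the
# classic 4-tuple env.step() API; the hyperparameters are illustrative, not the author's.
if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4, n_actions=4,
                  input_dims=[8], mem_size=100000, batch_size=64)
    for i in range(500):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()                    # no-op until the buffer holds a full batch
            observation = observation_
            score += reward
        print('episode', i, 'score %.1f' % score, 'epsilon %.3f' % agent.epsilon)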
class Network:
    def __init__(self, input_size, num_actions, gamma=DEFAULT_GAMMA,
                 buffer_size=DEFAULT_BUFFER_SIZE, batch_size=DEFAULT_BATCH_SIZE,
                 load_from_path=None, prepare_conv=False):
        """
        Includes the double network and is in charge of training and managing it
        :param input_size:
        :param num_actions:
        :param buffer_size: int. Size of the replay buffer
        :param batch_size: int. Size of the batch
        """
        # Instantiate both models
        net = Raimbow if len(input_size) == 3 else DQN
        self.current_model = net(input_size=input_size,
                                 num_actions=num_actions,
                                 prepare_decoder=prepare_conv)
        if load_from_path is not None:
            self.load_weights(path=load_from_path)
        self.target_model = net(input_size=input_size,
                                num_actions=num_actions,
                                prepare_decoder=prepare_conv)

        # Put them into the GPU if available
        if USE_CUDA:
            self.current_model = self.current_model.cuda()
            self.target_model = self.target_model.cuda()

        # Initialize the Adam optimizer and the replay buffer
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                           self.current_model.parameters()),
                                    lr=0.00001)
        self.replay_buffer = ReplayBuffer(capacity=buffer_size)

        # Make both networks start with the same weights
        self.update_target()

        # Save the rest of the parameters
        self.batch_size = batch_size
        self.gamma = gamma
        self.input_channels = input_size

    def get_action(self, state):
        return self.current_model.act(state, epsilon=0.)

    def update_target(self):
        """
        Updates the target model with the weights of the current model
        """
        self.target_model.load_state_dict(self.current_model.state_dict())

    def compute_td_loss(self, samples):
        """
        Compute the loss of batch_size samples of the buffer, and train the current
        model network with that loss
        :param samples: tuple of samples. Samples must have the format
                        (state, action, reward, next_state, done)
        :return: float. Loss computed at this learning step
        """
        # Take N playing samples
        state, action, reward, next_state, done = samples

        # Transform them into torch variables, for being used on GPU during the training
        state = Variable(torch.FloatTensor(np.float32(state)))
        next_state = Variable(torch.FloatTensor(np.float32(next_state)))
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        # Get the q value of this state and all the q values of the following step
        q_value = self.current_model(state).gather(
            1, action.unsqueeze(1)).squeeze(1)
        next_q_values = self.current_model(next_state)
        # Get the q values of the following step following the static policy of the target model
        next_q_state_values = self.target_model(next_state)
        # For all the q_values of the next state, get the one of the action which
        # would be selected by the static policy
        next_q_value = next_q_state_values.gather(
            1, torch.max(next_q_values, 1)[1].unsqueeze(1)).squeeze(1)
        # Calculate the expected q value as the immediate reward plus gamma times the
        # expected reward at t+1 (if not ended)
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        # Calculate the smooth L1 (Huber) loss
        loss = nn.functional.smooth_l1_loss(q_value,
                                            Variable(expected_q_value.data))

        # Backpropagate the loss
        self.optimizer.zero_grad()
        loss.backward()
        # Learn
        self.optimizer.step()

        # Return the loss of this step
        return loss

    def compute_conv_loss(self, frames):
        """
        Compute the reconstruction loss of a batch of frames and train the
        encoder-decoder of the current model with that loss
        :param frames: batch of states sampled from the buffer
        :return: float. Loss computed at this learning step
        """
        # Transform them into torch variables, for being used on GPU during the training
        state = Variable(torch.FloatTensor(frames), requires_grad=True)
        loss = (state - self.current_model.forward(state)).pow(2).mean()

        # Backpropagate the loss
        self.optimizer.zero_grad()
        loss.backward()
        # Learn
        self.optimizer.step()

        # Return the loss of this step
        return loss

    def train_convolutional_part(self, env, n_frames, print_state_every=100):
        self.current_model.mode_enc_dec = True
        # Take a random action
        action = self.current_model.act(state=None, epsilon=1.)
        state = env.reset()
        states_buffer = ReplayBuffer(capacity=1000)
        losses = []
        for i in range(n_frames):
            next_state, reward, done, _ = env.step(action)
            states_buffer.push(state, action, reward, next_state, done)
            # Re-sample a random action every 4 frames
            if i % 4 == 0:
                action = self.current_model.act(state=None, epsilon=1.)
            if done:
                print("Episode done during Encoder Decoder Training")
                state = env.reset()
            if len(states_buffer) > self.batch_size:
                # Train
                loss = self.compute_conv_loss(
                    states_buffer.state_sample(batch_size=self.batch_size))
                # Save the loss
                losses.append(loss.item())
            if i % print_state_every == 0 and len(losses) > 1:
                print("Training Encoder Decoder. Step:" + str(i) + "/" +
                      str(n_frames) + ". "
                      "Mean Loss: " +
                      str(np.round(np.mean(losses[-10:]), decimals=5)))
        # Freeze the encoder once the reconstruction pre-training is done
        for param in self.current_model.encoder.parameters():
            param.requires_grad = False
        self.current_model.mode_enc_dec = False
        self.update_target()

    def epsilon_by_frame(self, frame_idx, epsilon_start=EPSILON_START,
                         epsilon_final=EPSILON_FINAL,
                         epsilon_decay=EPSILON_DECAY):
        """
        Gets the epsilon of the current frame for the given parameters
        :param frame_idx: int. Index of the frame
        :param epsilon_start: float. Epsilon at frame 1
        :param epsilon_final: float. Minimum epsilon for maintaining exploration
        :param epsilon_decay: int. Manages how fast the epsilon decays
        :return: Epsilon for the frame frame_idx
        """
        return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
            -1. * frame_idx / epsilon_decay)

    def train(self, env, num_frames=DEFAULT_NUM_FRAMES,
              DQN_update_ratio=DEFAULT_DQN_UPDATE_RATIO, plotting_path=None,
              verbose=True, videos_to_save=DEFAULT_VIDEOS_TO_SAVE,
              train_conv_first=True, show=False):
        """
        Train the network in the given environment for an amount of frames
        :param env:
        :param num_frames:
        :return:
        """
        if train_conv_first:
            self.train_convolutional_part(env=env,
                                          n_frames=CONV_TRAINING_FRAMES)
        # Save the losses of the network and the rewards of each episode
        losses, all_rewards = [], []
        episode_reward = 0

        # Reset the game for starting the game from 0
        state = env.reset()
        actions_taken = []
        for i in range(MIN_RANDOM_ACTIONS):
            action = self.current_model.act(state, epsilon=1.)
            next_state, reward, done, _ = env.step(action)
            self.replay_buffer.push(state, action, reward, next_state, done)
            state = next_state
            if done:
                env.reset()

        for frame_idx in range(1, num_frames + 1):
            # Gets an action for the current state having in account the current epsilon
            action = self.current_model.act(
                state, epsilon=self.epsilon_by_frame(frame_idx=frame_idx))
            actions_taken.append(action)
            if show:
                env.render()
            # Execute the action, capturing the new state, the reward and if the game is ended or not
            next_state, reward, done, _ = env.step(action)
            # Save the action at the replay buffer
            self.replay_buffer.push(state, action, reward, next_state, done)

            # Update the current state and the actual episode reward
            state = next_state
            episode_reward += reward

            # If a game is finished save the results of that game and restart the game
            if done:
                print("Episode Reward: " + str(episode_reward) + ". "
                      "Std of actions: " +
                      str(np.round(np.std(actions_taken), decimals=4)) + ". "
                      "Epsilon " +
                      str(np.round(self.epsilon_by_frame(frame_idx=frame_idx),
                                   decimals=3)))
                actions_taken = []
                all_rewards.append(episode_reward)
                state, episode_reward = env.reset(), 0

            # If there are enough actions in the buffer for learning, start to learn a policy
            if frame_idx % ACTIONS_PER_TRAIN_STEP == 0:
                # Train
                loss = self.compute_td_loss(
                    self.replay_buffer.sample(self.batch_size))
                # Save the loss
                losses.append(loss.item())

            if plotting_path is not None and frame_idx % PLOT_EVERY == 0:
                save_plot(frame_idx, all_rewards, losses,
                          path_to_save=plotting_path)

            if frame_idx % DQN_update_ratio == 0:
                self.update_target()

            if verbose and frame_idx % DQN_update_ratio == 0:
                print(env.unwrapped.spec.id + ' Training: ' + str(frame_idx) +
                      '/' + str(num_frames) + '. '
                      'Mean Rewards: ' +
                      str(np.round(np.mean(all_rewards[-10:]), decimals=2)))

            if frame_idx % (num_frames // videos_to_save) == 0:
                save_video(env=env, policy=self,
                           path=os.path.join(plotting_path, VIDEOS_DIR_NAME,
                                             'During Training',
                                             str(len(all_rewards)) + ' Games'))
                env.reset()

    def save(self, path):
        if not os.path.isdir(os.path.dirname(path)):
            os.makedirs(os.path.dirname(path))
        self.current_model.cpu()
        torch.save(self.current_model.state_dict(), path)
        if USE_CUDA:
            self.current_model.cuda()

    def load_weights(self, path):
        if not os.path.isfile(path):
            warnings.warn("Trying to load non-existent weights. Skipping")
        else:
            self.current_model.cpu()
            output_state_dict = torch.load(path)
            new_dict = {
                key: (output_state_dict[key]
                      if key in output_state_dict else value)
                for key, value in self.current_model.state_dict().items()
            }
            self.current_model.load_state_dict(new_dict)
            # Freeze everything except the output heads
            for param in self.current_model.parameters():
                param.requires_grad = False
            for param in self.current_model.pre_output.parameters():
                param.requires_grad = True
            for param in self.current_model.output.parameters():
                param.requires_grad = True
            if USE_CUDA:
                self.current_model.cuda()
        return self.current_model
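# A minimal usage sketch for the Network wrapper above, assuming an already-wrapped Atari
# environment yielding (C, H, W) image observations and that the constants referenced by
# train() (DEFAULT_NUM_FRAMES, CONV_TRAINING_FRAMES, etc.) are defined in the surrounding
# module; make_atari_env, the environment name and the paths are placeholders, not taken
# from the original project.
env = make_atari_env('PongNoFrameskip-v4')       # hypothetical env factory
network = Network(input_size=env.observation_space.shape,
                  num_actions=env.action_space.n,
                  prepare_conv=True)             # builds the decoder used by train_convolutional_part
network.train(env,
              plotting_path='results/pong',      # placeholder output directory
              train_conv_first=True)             # pre-train the encoder-decoder, then freeze the encoder
network.save('results/pong/weights.pth')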
user_loc = np.random.randint(0, 101, U_num).tolist()  # User locations, nodes 1-100
user_dis = random_displacement(user_loc)  # Users' future displacement: up/down/left/right (-10, 10, -1, 1)
use_buff = np.random.randint(3, 8, U_num).tolist()  # Resources required
state0 = [user_loc, user_dis, node_loc, use_buff]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Main program
policy_net = DQN(U_num, num_actions).to(device)  # Initialize the Q network
target_net = DQN(U_num, num_actions).to(device)  # Initialize the target Q network
target_net.load_state_dict(policy_net.state_dict())  # Initialize the target Q network with the Q network's parameters
target_net.eval()

optimizer_policy = torch.optim.Adam(policy_net.parameters(),
                                    lr=learning_rate)  # Define the Adam optimizer (can be swapped for another)
buffer = ReplayBuffer(buffer_size)  # Create a replay buffer; it stores experience data that is later sampled at random to update the network parameters (see Buffer.py)
criterion = torch.nn.MSELoss(reduction='sum')

# training
for i_episode in range(num_episodes):
    # state0  # get an initial state
    for t in count():
        # Select an action
        action = e_greedy_select_action(state0)
        print("action selected by e_greedy is {}".format(action))
        # Use the transition function to get the next state reached by taking the
        # current action in the current state, and whether that next state is terminal
        state1, done, flag = transition_function(state0, action)
        # Use the reward function to get the current reward
        reward, cost_migration = reward_function(state0, action, state1, flag)
class maddpg():
    """Interacts with and learns from the environment."""

    def __init__(self, env, config):
        """Initialize an Agent object.

        Params
        ======
            env : environment to be handled
            config : configuration given a variety of parameters
        """
        self.env = env
        self.config = config
        # self.seed = (config['seed'])

        # set parameters for ML
        self.set_parameters(config)
        # Replay memory
        self.memory = ReplayBuffer(config)
        # Q-Network
        self.create_agents(config)
        # load agent
        if self.load_model:
            self.load_agent('trained_tennis_2k86.pth')

    def set_parameters(self, config):
        # Base agent parameters
        self.gamma = config['gamma']  # discount factor
        self.tau = config['tau']
        self.max_episodes = config['max_episodes']  # max number of episodes to train
        self.env_file_name = config['env_file_name']  # name and path for env app
        self.brain_name = config['brain_name']  # name for env brain used in step
        self.train_mode = config['train_mode']
        self.load_model = config['load_model']
        self.save_model = config['save_model']
        self.num_agents = config['num_agents']
        self.state_size = config['state_size']
        self.action_size = config['action_size']
        self.hidden_size = config['hidden_size']
        self.buffer_size = config['buffer_size']
        self.batch_size = config['batch_size']
        self.learn_every = config['learn_every']
        self.learn_num = config['learn_num']
        self.critic_learning_rate = config['critic_learning_rate']
        self.actor_learning_rate = config['actor_learning_rate']
        self.noise_decay = config['noise_decay']
        self.seed = (config['seed'])
        torch.manual_seed(self.seed)
        np.random.seed(self.seed)
        random.seed(self.seed)
        self.noise_scale = 1
        self.results = struct_class()
        # Some debug flags
        self.debug_show_memory_summary = False

    def create_agents(self, config):
        self.maddpg_agent = [ddpg_agent(config), ddpg_agent(config)]
        for a_i in range(self.num_agents):
            self.maddpg_agent[a_i].id = a_i

    def step(self, states, actions, rewards, next_states, dones):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        # print('Step adding types')
        # : ,states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        actions = np.reshape(actions, (1, 2 * self.action_size))
        self.memory.add(states, actions, rewards, next_states, dones)

    def act(self, state):
        """Returns actions for given state as per current policy.
        Should only get single or single joined states from train."""
        state = ten(state)
        actions = np.vstack([agent.act(state) for agent in self.maddpg_agent])
        return actions

    def actor_target(self, states):
        """Returns actions for given states as per current target policy, without noise.
        Should only get batch_size states from learn."""
        actions = np.hstack([agent.act(states) for agent in self.maddpg_agent])
        return ten(actions)

    def init_results(self):
        """Keep different results in lists in self.results, initialized here."""
        self.results.reward_window = deque(maxlen=100)
        self.results.all_rewards = []
        self.results.avg_rewards = []
        self.results.critic_loss = []
        self.results.actor_loss = []

    def episode_reset(self, i_episode):
        self.noise_reset()
        self.episode = i_episode
        self.noise_scale *= self.noise_decay
        for agent in self.maddpg_agent:
            agent.noise_scale = self.noise_scale
            agent.episode = self.episode

    def noise_reset(self):
        for agent in self.maddpg_agent:
            agent.noise.reset()

    def train(self):
        print('Running on device : ', device)
        # if False:
        #     filename = 'trained_reacher_a_e100.pth'
        #     self.load_agent(filename)
        self.init_results()

        # training loop
        # show progressbar
        widget = [
            'episode: ', pb.Counter(), '/', str(self.max_episodes), ' ',
            pb.Percentage(), ' ', pb.ETA(), ' ',
            pb.Bar(marker=pb.RotatingMarker()), ' '
        ]
        timer = pb.ProgressBar(widgets=widget,
                               maxval=self.max_episodes).start()

        for i_episode in range(self.max_episodes):
            timer.update(i_episode)
            tic = time.time()
            # per-episode resets
            self.episode_reset(i_episode)
            total_reward = np.zeros(self.num_agents)

            # Reset the environment
            env_info = self.env.reset(
                train_mode=self.train_mode)[self.brain_name]
            states = self.get_states(env_info)
            t = 0
            dones = np.zeros(self.num_agents, dtype=bool)

            # loop over episode time steps
            while not any(dones):
                # act and collect data
                actions = self.act(states)
                env_info = self.env.step(actions)[self.brain_name]
                next_states = self.get_states(env_info)
                rewards = env_info.rewards
                dones = env_info.local_done
                # increment stuff
                t += 1
                total_reward += rewards
                # np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
                # print('Episode {} step {} taken action {} reward {} and done is {}'.format(i_episode,t,actions,rewards,dones))
                # Proceed agent step
                self.step(states, actions, rewards, next_states, dones)
                # prepare for next round
                states = next_states
            #:while not done

            # Learn, if enough samples are available in memory
            if (i_episode % self.learn_every == 0):
                if len(self.memory) > self.batch_size:
                    for l in range(self.learn_num):
                        experiences = self.memory.sample()
                        self.learn(experiences)

            toc = time.time()

            # keep track of rewards:
            self.results.all_rewards.append(total_reward)
            self.results.avg_rewards.append(np.mean(self.results.reward_window))
            self.results.reward_window.append(np.max(total_reward))

            # Output episode info:
            self.print_episode_info(total_reward, t, tic, toc)
        # for i_episode

        if self.save_model:
            filename = 'trained_tennis' + str(self.seed) + '.pth'
            self.save_agent(filename)

        return self.results

    def get_states(self, env_info):
        return np.reshape(env_info.vector_observations,
                          (1, 2 * self.state_size))

    def print_episode_info(self, total_reward, num_steps, tic, toc):
        if (self.episode % 20 == 0) or (np.max(total_reward) > 0.01):
            if np.max(total_reward) > 0.01:
                if np.sum(total_reward) > 0.15:
                    if np.sum(total_reward) > 0.25:
                        StyleString = Back.GREEN
                        print('Double Hit')
                    else:
                        StyleString = Back.BLUE
                else:
                    StyleString = Back.RED
            else:
                StyleString = ''
            print(
                StyleString +
                'Episode {} with {} steps || Reward : {} || avg reward : {:6.3f} || Noise {:6.3f} || {:5.3f} seconds, mem : {}'
                .format(self.episode, num_steps, total_reward,
                        np.mean(self.results.reward_window), self.noise_scale,
                        toc - tic, len(self.memory)))
            print(Style.RESET_ALL, end='')

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples.

        q_target = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        """
        states, actions, rewards, next_states, dones = experiences
        # print('Learning shape : ', states.shape, actions.shape, rewards.shape, next_states.shape, dones.shape)
        # print('Learning state & reward shape : ', states[0].shape, rewards[0].shape)
        actor_loss = []
        critic_loss = []
        both_next_actions = self.actor_target(next_states)
        # print('Learn both', both_next_actions.shape)
        for agent in self.maddpg_agent:
            # In case of joined states, we want actions_next from both agents for learning
            al, cl = agent.learn(states, actions, rewards, next_states,
                                 both_next_actions, dones)
            actor_loss.append(al)
            critic_loss.append(cl)
        self.results.actor_loss.append(actor_loss)
        self.results.critic_loss.append(critic_loss)

    def save_agent(self, filename):
        states, actions, rewards, next_states, dones = self.memory.save_buffer()
        print('save agent : ', states.shape, actions.shape, rewards.shape,
              next_states.shape, dones.shape)
        torch.save(
            {
                'critic_local0': self.maddpg_agent[0].critic_local.state_dict(),
                'critic_target0': self.maddpg_agent[0].critic_target.state_dict(),
                'actor_local0': self.maddpg_agent[0].actor_local.state_dict(),
                'actor_target0': self.maddpg_agent[0].actor_target.state_dict(),
                'critic_local1': self.maddpg_agent[1].critic_local.state_dict(),
                'critic_target1': self.maddpg_agent[1].critic_target.state_dict(),
                'actor_local1': self.maddpg_agent[1].actor_local.state_dict(),
                'actor_target1': self.maddpg_agent[1].actor_target.state_dict(),
                'memory': (states, actions, rewards, next_states, dones),
            }, filename)
        print('Saved Networks and ER-memory in ', filename)
        return

    def load_agent(self, filename):
        savedata = torch.load(filename)
        self.maddpg_agent[0].critic_local.load_state_dict(savedata['critic_local0'])
        self.maddpg_agent[0].critic_target.load_state_dict(savedata['critic_target0'])
        self.maddpg_agent[0].actor_local.load_state_dict(savedata['actor_local0'])
        self.maddpg_agent[0].actor_target.load_state_dict(savedata['actor_target0'])
        self.maddpg_agent[1].critic_local.load_state_dict(savedata['critic_local1'])
        self.maddpg_agent[1].critic_target.load_state_dict(savedata['critic_target1'])
        self.maddpg_agent[1].actor_local.load_state_dict(savedata['actor_local1'])
        self.maddpg_agent[1].actor_target.load_state_dict(savedata['actor_target1'])
        states, actions, rewards, next_states, dones = savedata['memory']
        self.memory.load_buffer(states, actions, rewards, next_states, dones)
        print('Memory loaded with length : ', len(self.memory))
        return
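# A minimal configuration sketch for the maddpg class above, covering the keys read in
# __init__() and set_parameters(); the values, the Unity environment path and the brain
# name are illustrative placeholders, not the original training configuration.
config = {
    'seed': 2,
    'gamma': 0.99, 'tau': 1e-3,
    'max_episodes': 2000,
    'env_file_name': 'Tennis.app',          # placeholder path to the Unity environment
    'brain_name': 'TennisBrain',            # placeholder brain name
    'train_mode': True,
    'load_model': False, 'save_model': True,
    'num_agents': 2, 'state_size': 24, 'action_size': 2,
    'hidden_size': 128,
    'buffer_size': int(1e5), 'batch_size': 128,
    'learn_every': 1, 'learn_num': 3,
    'critic_learning_rate': 1e-3, 'actor_learning_rate': 1e-4,
    'noise_decay': 0.9995,
}
# env = UnityEnvironment(file_name=config['env_file_name'])  # assumes the Unity ML-Agents API
# agent = maddpg(env, config)
# results = agent.train()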