def test_random_action():
    env = gym.make('gym_kinova_gripper:kinovagripper-v0')
    obs, done = env.reset(), False
    noise = OUNoise(3)
    max_action = float(env.action_space.high[0])
    correct = 0
    noise.reset()
    cum_reward = 0.0
    for i in range(100):
        finger_actions = noise.noise().clip(-max_action, max_action)
        # actions = np.array([0.0, finger_actions[0], finger_actions[1], finger_actions[2]])
        actions = np.array([0.4, 0.5, 0.5, 0.5])
        obs, reward, done, _ = env.step(actions)
        inputs = torch.FloatTensor(np.array(obs)).to(device)
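# Every snippet in this section depends on an OUNoise class that never appears
# here. Below is a minimal sketch of the usual Ornstein-Uhlenbeck process,
# exposing all the interfaces the callers use (noise(), sample(), get_action(),
# a mutable `scale`, and an optional seed); the mu/theta/sigma defaults are
# assumptions, not this project's exact values.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, seed=None, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = 1.0  # callers anneal this externally each episode
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        # restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # one Euler step of the OU process, scaled by self.scale
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * self.rng.standard_normal(self.action_dim)
        self.state = self.state + dx
        return self.state * self.scale

    # alias used by the agents further down that call self.noise.sample()
    sample = noise

    def get_action(self, action):
        # additive exploration around a deterministic action (TF agent below)
        return action + self.noise()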
def main():
    my_env = env()
    agent = NAF_CNN(0.99, 0.001, 128, my_env.observation_space.shape[0],
                    my_env.action_space)

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--noise_scale', type=float, default=0.3, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                        help='number of episodes with noise (default: 100)')
    args = parser.parse_args()

    ounoise = OUNoise(my_env.action_space.shape[0])
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
        0, args.exploration_end - 1) / args.exploration_end + args.final_noise_scale
    ounoise.reset()

    state = my_env.reset()
    i = 10
    while i > 0:
        action = agent.select_action(state, ounoise)
        print("action: {}".format(action))
        next_state, reward, done = my_env.step(action)
        if done:
            break
        print(reward)
        i = i - 1
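# The same linear noise-decay schedule recurs in nearly every training loop in
# this section. Factoring it out makes the intent explicit; this helper is an
# assumption (none of the snippets define one) but reproduces the inline
# expression exactly.
def exploration_noise_scale(i_episode, initial_scale, final_scale, exploration_end):
    # linear decay from initial_scale to final_scale over exploration_end
    # episodes, held at final_scale afterwards
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (initial_scale - final_scale) * frac + final_scale

# e.g. ounoise.scale = exploration_noise_scale(i_episode, args.noise_scale,
#                                              args.final_noise_scale,
#                                              args.exploration_end)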
class DDPGB(object):
    # x is the value vector
    # b is the code vector
    # C is the standard codebook matrix
    # action_output_num is the dimension of the code output
    # replay_size is the maximum length of the memory queue
    # new_b holds the newly computed b
    # env is the environment that produces actions and observations
    # agent is the actual DDPG executor
    # the noise parameters are kept only so the random-exploration branch can run
    def __init__(self, C, b, x, action_output_num, actor_size, replay_size=1000000,
                 ou_noise=True, param_noise=True, noise_scale=0.3,
                 final_noise_scale=0.3):
        self.C = C
        self.b = b
        self.x = x
        self.hd = action_output_num
        self.actor_size = actor_size
        self.memory = ReplayMemory(replay_size)
        self.new_b = None
        self.env = None
        self.agent = None
        self.ou_noise = ou_noise
        self.noise_scale = noise_scale
        self.final_noise_scale = final_noise_scale
        self.ounoise = OUNoise(action_output_num) if ou_noise else None
        self.param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=0.05, desired_action_stddev=noise_scale,
            adaptation_coefficient=1.05) if param_noise else None

    def update_B(self, c, b, x):
        self.C = c
        self.b = b
        self.x = x

    # coff is the candidate weighting between the reward terms, e.g. [0.2, 0.8]
    def generate_B(self, coff, gamma, tau, hidden_size, num_inputs, actor_size,
                   num_episodes=60000, exploration_end=150, batch_size=512,
                   updates_per_step=5000):
        self.env = QuantizationEnv(self.C, self.b, self.x, self.hd, coff)
        self.agent = DDPG(gamma, tau, hidden_size, self.env.action_bin,
                          num_inputs, actor_size)
        rewards = []
        total_numsteps = 0
        updates = 0
        max_trail = 10000
        best_bb = 10000

        # search for the best solution over num_episodes episodes
        for i_episode in range(num_episodes):
            state = torch.Tensor([self.env.reset()])
            if self.ou_noise:
                self.ounoise.scale = (self.noise_scale - self.final_noise_scale) * max(
                    0, exploration_end - i_episode) / exploration_end + self.final_noise_scale
                self.ounoise.reset()
            if self.param_noise:
                self.agent.perturb_actor_parameters(self.param_noise)

            episode_reward = 0
            continuous_neg = 0
            continuous_pos = 0
            temp_trail = 0
            control_bit = 0
            next_state = self.env.compute_Cbx(self.b)
            next_state = torch.Tensor([next_state])
            while True:
                # yyj
                if control_bit > 15:
                    control_bit = control_bit % 16
                state = next_state
                action = self.agent.select_action(state, self.ounoise, self.param_noise)
                next_state, reward, done, bb = self.env.step(
                    action, control_bit, self.actor_size)
                # print(control_bit, next_state[0], reward, done, bb)
                control_bit = control_bit + 1
                total_numsteps += 1
                episode_reward += reward

                # bb is the c_v value
                if best_bb > bb:
                    best_bb = bb
                    self.new_b = action
                if reward > 0:
                    continuous_pos += 1
                    continuous_neg = 0
                    if continuous_pos > 10:
                        done = True
                if reward < 0:
                    continuous_neg += 1
                    continuous_pos = 0
                    if continuous_neg > 10:
                        done = True
                if temp_trail > max_trail:
                    done = True

                action = torch.Tensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.Tensor([next_state])
                reward = torch.Tensor([reward])
                self.memory.push(state, action, mask, next_state, reward)
                # state = next_state
                temp_trail += 1

                # skipped until the memory holds enough samples
                if len(self.memory) > batch_size:
                    for _ in range(updates_per_step):
                        transitions = self.memory.sample(1)
                        batch = Transition(*zip(*transitions))
                        # value_loss belongs to the right-hand (critic) network,
                        # policy_loss to the left-hand (actor) network
                        value_loss, policy_loss = self.agent.update_parameters(batch)
                        print("epoch:", i_episode, "updates", updates,
                              "value_loss:", value_loss, " policy_loss:", policy_loss)
                        updates += 1
                if done:
                    break

            if self.param_noise:
                episode_transitions = self.memory.memory[
                    self.memory.position - batch_size:self.memory.position]
                states = torch.cat(
                    [transition[0] for transition in episode_transitions], 0)
                unperturbed_actions = self.agent.select_action(states, None, None)
                perturbed_actions = torch.cat(
                    [transition[1] for transition in episode_transitions], 0)
                ddpg_dist = ddpg_distance_metric(perturbed_actions.numpy(),
                                                 unperturbed_actions.numpy())
                self.param_noise.adapt(ddpg_dist)

            rewards.append(episode_reward)
            continuous_neg = 0
            continuous_pos = 0
            temp_trail = 0

            if i_episode % 10 == 0 and i_episode != 0:
                state = torch.Tensor([self.env.reset()])
                episode_reward = 0
                control_bit = 0
                while True:
                    action = self.agent.select_action(state)
                    next_state, reward, done, bb = self.env.step(
                        action.numpy()[0], control_bit)
                    episode_reward += reward
                    if best_bb > bb:
                        best_bb = bb
                        self.new_b = action
                    if reward > 0:
                        continuous_pos += 1
                        continuous_neg = 0
                        if continuous_pos > 10:
                            done = True
                    if reward < 0:
                        continuous_neg += 1
                        continuous_pos = 0
                        if continuous_neg > 10:
                            done = True
                    if temp_trail > max_trail:
                        done = True
                    next_state = torch.Tensor([next_state])
                    state = next_state
                    temp_trail += 1
                    if done:
                        break
                rewards.append(episode_reward)
                print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}"
                      .format(i_episode, total_numsteps, rewards[-1],
                              np.mean(rewards[-10:])))
        return self.new_b
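# DDPGB above, like most of the loops in this section, leans on a ReplayMemory
# with push/sample/memory/position members and a Transition namedtuple, neither
# of which is defined here. This is a minimal sketch consistent with those call
# sites; the field names mirror the push() argument order and are otherwise
# assumptions. The append() variant matches the SL reservoir buffers used by
# the fit_nash functions below.
import random
from collections import namedtuple

# field order mirrors memory.push(state, action, mask, next_state, reward)
Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # fixed-size ring buffer
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def append(self, state, action):
        # the SL buffers in fit_nash store raw (state, action) pairs
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)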
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(
        args.NashMode, RC, args.AttackMode, args.RewardMode)
    # reward_file = open('reward' + suffix + '.txt', 'w')
    # attack_file = open('attacker_action' + suffix + '.txt', 'w')
    # weight_file = open('vehicle_weight' + suffix + '.txt', 'w')
    # distance_file = open('Distance' + suffix + '.txt', 'w')
    # reward_file.write("""
    # Environment Initializing...
    # The initial head car velocity is {}
    # The initial safe distance is {}
    # The Nash Eq* Factor RC is {}
    # The Reward Calculation Mode is {}
    # The Attack Mode is {}
    # The Nash Mode is {}
    # """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash))
    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')
    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')
    except:
        print('No existing vehicle RL model')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')
    except:
        print('No existing attacker RL model')
    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except:
        policy_vehicle = create_SL_model(env.observation_space,
                                         env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except:
        policy_attacker = create_SL_model(env.observation_space,
                                          env.attacker_action_space, 'attacker')
    print('*' * 20, '\n\n\n')

    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)
    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])
    rewards = []
    total_numsteps = 0

    for i_episode in range(args.num_episodes):
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            reward_data.to_csv(suffix + '.csv', index=False)
            res_data.to_csv(suffix + '.csv', index=False)

        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode / args.num_episodes * 0.1

        print('No.{} episode starts...\nETA is {}'.format(i_episode, ETA))
        # reward_file = open('reward' + suffix + '.txt', 'a')
        # attack_file = open('attacker_action' + suffix + '.txt', 'a')
        # weight_file = open('vehicle_weight' + suffix + '.txt', 'a')
        # distance_file = open('Distance' + suffix + '.txt', 'a')

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        local_steps = 0
        while True:
            sigma = random.random()
            if sigma > ETA:
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor(state_record[-20:]),
                    ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                action_attacker = agent_attacker.select_action(
                    torch.Tensor(state_record[-20:]),
                    ounoise_attacker, param_noise_attacker)[:, -1, :]
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                     policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) /
                     policy_attacker.predict(state_record[-1].reshape(-1, 4)).sum()])[0]

            # constrain the weights to sum to 1
            action_vehicle = action_vehicle.numpy()[0] / (action_vehicle.numpy()[0].sum())
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack': env.action_attacker,
                                         'Weight': action_vehicle,
                                         'Eva_distance': env.d}])
            # write the processed attack value back over the raw one
            action_attacker = env.action_attacker
            total_numsteps += 1
            episode_reward += reward
            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1, 4))
            action_attacker = torch.Tensor(action_attacker.reshape(1, 4))
            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])
            memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask,
                                next_state, reward_vehicle)
            memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask,
                                 next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(
                    i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(
                #     i_episode, episode_reward))
                break

        if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:
            # start training
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))
                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))
                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)
                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                agent_vehicle.update_parameters(batch_vehicle)
                agent_attacker.update_parameters(batch_attacker)
                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

        if i_episode % 10 == 0 and i_episode != 0:
            eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance'])
            # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode))
            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor(state_record[-20:]),
                        ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor(state_record[-20:]),
                        ounoise_attacker, param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                         policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]
                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]
                next_state, reward, done = env.step(action_vehicle, action_attacker,
                                                    attack_mode=2)
                eva_res_data = eva_res_data.append([{'Eva_reward': evaluate_reward,
                                                     'Eva_distance': env.d}])
                evaluate_reward += reward
                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(
                    #     i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    break

    # writer.add_scalar('reward/test', episode_reward, i_episode)
    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()
    env.close()
    reward_data.to_csv(suffix + '_reward.csv', index=False)
    res_data.to_csv(suffix + '.csv', index=False)
    eva_res_data.to_csv(suffix + '_eva.csv', index=False)

    # save models
    agent_vehicle.save_model('vehicle_' + suffix)
    agent_attacker.save_model('attacker_' + suffix)
    policy_attacker.save('models/attacker_' + suffix + '.h5')
    policy_vehicle.save('models/vehicle_' + suffix + '.h5')
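# fit_nash and DDPGB both build an AdaptiveParamNoiseSpec and (in DDPGB) call
# ddpg_distance_metric, neither of which is shown in this section. Below is a
# sketch of the standard adaptive parameter-noise recipe (Plappert et al.,
# 2017) matching those call sites; treat it as a reference implementation, not
# this project's exact code.
import numpy as np

class AdaptiveParamNoiseSpec:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # grow the perturbation while the perturbed policy stays too close to
        # the unperturbed one; shrink it once the action-space distance passes
        # the target
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient

def ddpg_distance_metric(actions1, actions2):
    # RMS distance between two batches of actions
    return np.sqrt(np.mean(np.square(actions1 - actions2)))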
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, num_all_agents, seed,
                 batch_size, buffer_size=int(1e6), gamma=0.99, tau=1e-3,
                 lr_actor=4e-4, lr_critic=4e-4, weight_decay=0,
                 discrete_actions=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_all_agents (int): number of agents
            seed (int): random seed
            batch_size (int): minibatch size
            buffer_size (int): replay buffer size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.num_all_agents = num_all_agents
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.noise = OUNoise(action_size, seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed,
                                 use_batch_norm_layers=False).to(device)
        self.actor_target = Actor(state_size, action_size, seed,
                                  use_batch_norm_layers=False).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        if discrete_actions:
            action_size = 1
        self.critic_local = Critic(state_size * num_all_agents,
                                   action_size * num_all_agents, seed).to(device)
        self.critic_target = Critic(state_size * num_all_agents,
                                    action_size * num_all_agents, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, tau, agent_index):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        (states_all, actions_all, rewards_all, next_states_all, dones,
         actions_next_target_all, actions_next_local_all) = experiences
        rewards_self = rewards_all[:, agent_index]
        states_self = states_all.view(-1, self.num_all_agents,
                                      self.state_size)[:, agent_index, :]
        del rewards_all

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states_all, actions_next_target_all)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards_self + (gamma * Q_targets_next) * (1 - dones)
        # Compute critic loss
        Q_expected = self.critic_local(states_all, actions_all)
        critic_loss = F.mse_loss(Q_expected.view(-1, self.batch_size), Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local(states_all, actions_next_local_all).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
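# The `experiences` tuple consumed by Agent.learn above is wider than the usual
# DDPG 5-tuple: the caller must pre-compute every agent's next action under
# both the target and local actor networks. A hedged sketch of that call site
# follows; the shared buffer and agent container are assumptions, not code
# from this project.
import torch

def maddpg_learn_step(agents, buffer, batch_size, gamma, tau):
    states_all, actions_all, rewards_all, next_states_all, dones = buffer.sample(batch_size)
    # each agent's next-state view, so the per-agent actors can be evaluated
    next_views = next_states_all.view(batch_size, len(agents), -1)
    actions_next_target_all = torch.cat(
        [a.actor_target(next_views[:, i, :]) for i, a in enumerate(agents)], dim=1)
    actions_next_local_all = torch.cat(
        [a.actor_local(next_views[:, i, :]) for i, a in enumerate(agents)], dim=1)
    experiences = (states_all, actions_all, rewards_all, next_states_all, dones,
                   actions_next_target_all, actions_next_local_all)
    for i, agent in enumerate(agents):
        agent.learn(experiences, gamma, tau, agent_index=i)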
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name', default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001,
                        help='discount factor for model (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=33, metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed', type=int, default=4, metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size', type=int, default=512, metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps', type=int, default=300, metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes', type=int, default=50, metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent', type=bool, default=True,
                        help='save model to file')
    parser.add_argument('--load_agent', type=bool, default=False,
                        help='load model from file')
    parser.add_argument('--train_model', type=bool, default=True,
                        help='Training or run')
    parser.add_argument('--load_exp', type=bool, default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot', type=bool, default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps', type=int, default=5, metavar='N',
                        help='amount of times greedy goes (default: 5)')
    args = parser.parse_args()

    # env = gym.make(args.env_name)
    env = Env()
    # env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N --
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{} is loaded".format(args.env_name,
                                                     args.batch_size, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1', 'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    # sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()
    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end \
                + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(state, ounoise) \
                if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)

            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i + 1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            # -- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size / 2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size / 2)
            #         transitions_g = memory_g.sample(args.batch_size / 2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)
            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)
            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:
            sate_Q_plot(agent, i_episode)

        # -- save model --
        if args.save_agent:
            agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
            with open('exp_replay.pk1', 'wb') as output:
                pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
            # with open('exp_replay_g.pk1', 'wb') as output:
            #     pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes // 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- calculate episodes without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")
                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0
                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[_].append(sta)
                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward
                    state_visited.append(state)
                    action_taken.append(action)
                    state = next_state
                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()
                if obs_hit:
                    steps = 300
                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append(np.max(greedy_reward[-greedy_range:]))
            lower_reward.append(np.min(greedy_reward[-greedy_range:]))
            avg_greedy_reward.append(np.mean(greedy_reward[-greedy_range:]))
            avg_steps_to_goal.append(np.mean(steps_to_goal[-greedy_range:]))
            print("Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}"
                  .format(i_episode, total_numsteps, avg_greedy_reward[-1],
                          np.mean(rewards[-greedy_episode:])))

    # -- save model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        # with open('exp_replay_g.pk1', 'wb') as output:
        #     pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format((time.time() - t_start) / 60))
    print('Time per ep : {} s'.format((time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)
    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy, lower_reward, upper_reward,
                     facecolor='red', alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size, '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size, '.png')
    plt.savefig(fname2)
    plt.close()
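# The loop above reads the robot state through the module-level `subdata`
# filled by the ROS subscriber registered with rospy.Subscriber. A minimal
# sketch of that callback; the StateMsg field name (`data`) is an assumption.
def callback(msg):
    global subdata
    subdata = msg.data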
class Agent(object):
    def __init__(self, state_space, action_space, max_action, device):
        self.state_size = state_space.shape[0]
        self.action_size = action_space.shape[0]
        self.max_action = max_action
        self.device = device

        self.actor_local = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_target = Actor(state_space.shape, action_space.high.size, max_action)
        self.actor_optimizer = optimizers.Adam(LR_ACTOR)
        # let target be equal to local
        self.actor_target.set_weights(self.actor_local.get_weights())

        self.critic_local = Critic(state_space.shape, action_space.high.size)
        self.critic_target = Critic(state_space.shape, action_space.high.size)
        self.critic_optimizer = optimizers.Adam(LR_CRITIC)
        # let target be equal to local
        self.critic_target.set_weights(self.critic_local.get_weights())

        self.noise = OUNoise(self.action_size)
        self.memory = ReplayBuffer(BUFFER_SIZE)
        self.current_steps = 0

    def step(self, state, action, reward, done, next_state, train=True) -> None:
        self.memory.store(state, action, reward, done, next_state)
        if train and self.memory.count > BATCH_SIZE and self.memory.count > MIN_MEM_SIZE:
            if self.current_steps % UPDATE_STEPS == 0:
                experiences = self.memory.sample(BATCH_SIZE)
                self.learn(experiences, GAMMA)
            self.current_steps += 1

    @tf.function
    def critic_train(self, states, actions, rewards, dones, next_states):
        with tf.device(self.device):
            # Compute yi
            u_t = self.actor_target(next_states)
            q_t = self.critic_target([next_states, u_t])
            yi = tf.cast(rewards, dtype=tf.float64) + \
                tf.cast(GAMMA, dtype=tf.float64) * \
                tf.cast((1 - tf.cast(dones, dtype=tf.int64)), dtype=tf.float64) * \
                tf.cast(q_t, dtype=tf.float64)

            # Compute MSE
            with tf.GradientTape() as tape:
                q_l = tf.cast(self.critic_local([states, actions]), dtype=tf.float64)
                loss = (q_l - yi) * (q_l - yi)
                loss = tf.reduce_mean(loss)

            # Update critic by minimizing loss
            dloss_dql = tape.gradient(loss, self.critic_local.trainable_weights)
            self.critic_optimizer.apply_gradients(
                zip(dloss_dql, self.critic_local.trainable_weights))
        return

    @tf.function
    def actor_train(self, states):
        with tf.device(self.device):
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(self.actor_local.trainable_variables)
                u_l = self.actor_local(states)
                q_l = -tf.reduce_mean(self.critic_local([states, u_l]))
            j = tape.gradient(q_l, self.actor_local.trainable_variables)
            self.actor_optimizer.apply_gradients(
                zip(j, self.actor_local.trainable_variables))
        return

    def learn(self, experiences, gamma) -> None:
        states, actions, rewards, dones, next_states = experiences
        states = np.array(states).reshape(BATCH_SIZE, self.state_size)
        states = tf.convert_to_tensor(states)
        actions = np.array(actions).reshape(BATCH_SIZE, self.action_size)
        actions = tf.convert_to_tensor(actions)
        rewards = np.array(rewards).reshape(BATCH_SIZE, 1)
        next_states = np.array(next_states).reshape(BATCH_SIZE, self.state_size)
        dones = np.array(dones).reshape(BATCH_SIZE, 1)

        self.critic_train(states, actions, rewards, dones, next_states)
        self.actor_train(states)
        self.update_local()
        return

    def update_local(self):
        def soft_updates(local_model: tf.keras.Model,
                         target_model: tf.keras.Model) -> np.ndarray:
            local_weights = np.array(local_model.get_weights())
            target_weights = np.array(target_model.get_weights())
            assert len(local_weights) == len(target_weights)
            new_weights = TAU * local_weights + (1 - TAU) * target_weights
            return new_weights

        self.actor_target.set_weights(
            soft_updates(self.actor_local, self.actor_target))
        self.critic_target.set_weights(
            soft_updates(self.critic_local, self.critic_target))

    def store_weights(self, episode: int) -> None:
        self.actor_target.save_weights(
            join(CKPTS_PATH, ACTOR_CKPTS, f'cp-{episode}'))
        self.critic_target.save_weights(
            join(CKPTS_PATH, CRITIC_CKPTS, f'cp-{episode}'))
        return

    def act(self, state, add_noise=True) -> (float, float):
        state = np.array(state).reshape(1, self.state_size)
        pure_action = self.actor_local.predict(state)[0]
        action = self.noise.get_action(pure_action)
        return action, pure_action

    def reset(self):
        self.noise.reset()
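# A hedged sketch of a driver loop for the TF agent above; the environment
# name, episode budget, and device string are placeholders, and the
# constructor signature is taken from __init__ above.
import gym
import tensorflow as tf

def run_ddpg_tf(episodes=10):
    env = gym.make('Pendulum-v1')  # placeholder environment
    device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'
    agent = Agent(env.observation_space, env.action_space,
                  max_action=float(env.action_space.high[0]), device=device)
    for _ in range(episodes):
        state, done = env.reset(), False
        agent.reset()  # restart the OU noise process
        while not done:
            action, _ = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, done, next_state)
            state = next_state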
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, device, state_size, action_size, random_seed, fc1=128,
                 fc2=128, lr_actor=1e-04, lr_critic=1e-04, weight_decay=0,
                 buffer_size=100000, batch_size=64, gamma=0.99, tau=1e-3):
        """
        Parameters
        ----------
        device: CPU/GPU
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        fc1 (int): 1st fully connected layer size for model (actor & critic)
        fc2 (int): 2nd fully connected layer size for model (actor & critic)
        lr_actor (float): learning rate for Actor
        lr_critic (float): learning rate for Critic
        weight_decay (float): weight decay used in model optimizer
        buffer_size (int): replay buffer size
        batch_size (int): batch size to sample from buffer
        gamma (float): parameter used to calculate Q target
        tau (float): soft update interpolation parameter
        """
        self.device = device
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # Actor network (with target)
        self.actor_local = Actor(self.state_size, self.action_size, random_seed,
                                 fc1_units=fc1, fc2_units=fc2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, random_seed,
                                  fc1_units=fc1, fc2_units=fc2).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(),
                                          lr=lr_actor)

        # Critic network (with target)
        self.critic_local = Critic(self.state_size, self.action_size, random_seed,
                                   fc1_units=fc1, fc2_units=fc2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, random_seed,
                                    fc1_units=fc1, fc2_units=fc2).to(device)
        self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   self.device, random_seed)

        self.make_copy(self.critic_local, self.critic_target)
        self.make_copy(self.actor_local, self.actor_target)
        print("Initialized agent with state size = {} and action size = {}"
              .format(self.state_size, self.action_size))

    def make_copy(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            batch = self.memory.sample()
            self.learn(batch)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, batch):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Parameters
        ----------
        batch (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = batch

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_target_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimise loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Parameters
        ----------
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
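# For context, a minimal sketch of a loop that exercises this agent; the
# environment name and episode budget are placeholders, not part of the
# original code.
import gym
import torch

def run_ddpg(episodes=10):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env = gym.make('Pendulum-v1')  # placeholder environment
    agent = Agent(device, env.observation_space.shape[0],
                  env.action_space.shape[0], random_seed=0)
    for i_episode in range(episodes):
        state, done, score = env.reset(), False, 0.0
        agent.reset()  # restart the OU noise process
        while not done:
            action = agent.act(state)[0]
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        print('episode {}: return {:.1f}'.format(i_episode, score))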
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)
    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    eva_ac_veh = []
    eva_ac_att = []
    total_numsteps = 0
    updates = 0
    # while len(state_record) < 20:
    #     s, _, _ = env.step(env.random_action())
    #     state_record.append(s)

    for i_episode in range(args.num_episodes):
        state = env.reset()
        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        while True:
            if random.random() < ETA:
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor([[state]]), ounoise_vehicle, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(
                    torch.Tensor([[state]]), ounoise_attacker, param_noise_attacker)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state.reshape(-1, 4)) /
                     policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state.reshape(-1, 4)) /
                     policy_attacker.predict(state.reshape(-1, 4)).sum()])

            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]

            next_state, reward, done = env.step(ac_v, ac_a)
            total_numsteps += 1
            episode_reward += reward

            memory_SL_vehicle.append(state, ac_v)
            memory_SL_attacker.append(state, ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])

            memory_vehicle.push(torch.Tensor([[state]]), action_vehicle, mask,
                                next_state, reward_vehicle)
            memory_attacker.push(torch.Tensor([[state]]), action_attacker, mask,
                                 next_state, reward_attacker)

            state = next_state.numpy()[0][0]
            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, instant reward is {:.2f}'.format(
                        i_episode, episode_reward))
                break

        if len(memory_vehicle) > args.batch_size:
            # start training
            # print('begin training')
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))
                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))
                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)
                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)
                value_loss_vehicle, policy_loss_vehicle = \
                    agent_vehicle.update_parameters(batch_vehicle)
                value_loss_attacker, policy_loss_attacker = \
                    agent_attacker.update_parameters(batch_attacker)
                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)
                updates += 1

        if i_episode % 10 == 0:
            state = env.reset()
            evaluate_reward = 0
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor([[state]]), ounoise_vehicle, param_noise_vehicle)
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor([[state]]), ounoise_attacker, param_noise_attacker)
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state.reshape(-1, 4)) / policy_vehicle.predict(
                        state.reshape(-1, 4)).sum()])
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state.reshape(-1, 4)) / policy_attacker.predict(
                        state.reshape(-1, 4)).sum()])
                if is_cuda:
                    ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
                else:
                    ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]

                next_state, reward, done = env.step(ac_v, ac_a)
                total_numsteps += 1
                evaluate_reward += reward
                state = next_state[0]
                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100, total_numsteps,
                        evaluate_reward, average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    print(ac_v[0])
                    eva_ac_veh.append((ac_v[0] + 1) / sum(ac_v[0] + 1))
                    eva_ac_att.append((ac_a[0] + 1) / sum(ac_a[0] + 1))
                    break
        # writer.add_scalar('reward/test', episode_reward, i_episode)

    env.close()

    f = plt.figure()
    plt.plot(eva_reward, label='Eva_reward')
    plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.show()

    AC_veh = np.array(eva_ac_veh)
    AC_att = np.array(eva_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1')
    plt.plot(AC_veh[:, 1], label='Bacon2')
    plt.plot(AC_veh[:, 2], label='Bacon3')
    plt.plot(AC_veh[:, 3], label='Bacon4')
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('Veh_result.png', dpi=300)
    plt.show()
class DDPG():
    """DDPG agent with own actor and critic."""

    def __init__(self, agent_id, model, action_size=2, seed=0):
        """Initialize an Agent object."""
        self.seed = random.seed(seed)
        self.id = agent_id
        self.action_size = action_size

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Initialize target weights to match the local actor and critic
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def hard_copy_weights(self, target, source):
        """Copy weights from source to target network (part of initialization)."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions, all_actions):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        # compute Q targets for current states (y_i)
        q_expected = self.critic_local(states, actions)
        # q_targets = reward of this timestep + discount * Q(st+1, at+1) from target network
        q_targets = rewards.index_select(1, agent_id) + (
            gamma * q_targets_next * (1 - dones.index_select(1, agent_id)))
        # compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())
        # minimize loss
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # compute actor loss
        self.actor_optimizer.zero_grad()
        # detach actions from other agents
        actions_pred = [actions if i == self.id else actions.detach()
                        for i, actions in enumerate(all_actions)]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # minimize loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
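# This DDPG variant expects its surrounding MADDPG trainer to pass in every
# agent's predicted next actions. A hedged sketch of that calling convention;
# the agent list and shared buffer are assumptions, not code from this project.
import torch

def update_agents(agents, buffer, gamma):
    for agent_id, agent in enumerate(agents):
        states, actions, rewards, next_states, dones = buffer.sample()
        views = states.view(states.shape[0], len(agents), -1)
        next_views = next_states.view(next_states.shape[0], len(agents), -1)
        # target-actor actions feed the critic update...
        all_next_actions = [a.actor_target(next_views[:, i, :])
                            for i, a in enumerate(agents)]
        # ...local-actor actions feed the policy update
        all_actions = [a.actor_local(views[:, i, :]) for i, a in enumerate(agents)]
        agent.learn(agent_id, (states, actions, rewards, next_states, dones),
                    gamma, all_next_actions, all_actions)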
class Agent():
    def __init__(self, state_size, action_size, num_agents, device, gamma=GAMMA,
                 tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC, random_seed=0):
        """
        Initialize an Agent object.

        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: learning rate of actor
        :param lr_critic: learning rate of critic
        :param random_seed: random seed
        :param device: cuda or cpu
        """
        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size, self.full_action_size,
                                   device=device, random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size, self.full_action_size,
                                    device=device, random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)
        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.

        :param state: state as seen from single agent
        """
        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()
        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype, device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        # self.actor_target.eval()
        # convert to cpu() since noise is in cpu()
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype, device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach()) ** 2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network.

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states, all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
def fit_nash(): agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space, env.vehicle_action_space) agent_attacker = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space, env.attacker_action_space) policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space) policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space) memory_vehicle = ReplayMemory(1000000) memory_attacker = ReplayMemory(1000000) memory_SL_vehicle = ReplayMemory(100000) memory_SL_attacker = ReplayMemory(100000) ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None rewards = [] eva_reward = [] ave_reward = [] tra_ac_veh = [] tra_ac_att = [] All_reward=[] total_numsteps = 0 updates = 0 state_record = [env.reset()] # while len(state_record) < 20: # s, _, _ = env.step(*env.random_action()) # state_record.append(s) # print(torch.Tensor([state_record[-20:]]).shape) for i_episode in range(args.num_episodes): local_steps = 0 state = env.reset() state_record = [np.array([state])] episode_steps = 0 while len(state_record) < 20: a, b = env.random_action() s, _, _ = env.step(np.array([a]), np.array([b])) local_steps += 1 state_record.append(s) if args.ou_noise: ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise_vehicle.reset() ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise_attacker.reset() episode_reward = 0 local_steps = 0 while True: if random.random() < ETA: # print(state_record[-20:]) # print('rl', torch.Tensor(state_record[-20:]).shape) action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :] # print('rl', action_vehicle.shape) action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :] # print('rl', action_vehicle.shape) else: action_vehicle = torch.Tensor( [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) / policy_vehicle.predict( state_record[-1].reshape(-1, 4)).sum()])[0] action_attacker = torch.Tensor( [policy_attacker.predict(state_record[-1].reshape(-1, 4)) / policy_attacker.predict( state_record[-1].reshape(-1, 4)).sum()])[0] # print('sl', action_vehicle.shape) # print('sl', action_vehicle.shape) if is_cuda: ac_v, ac_a = action_vehicle.cpu().numpy(), action_attacker.cpu().numpy()[0] else: ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy() next_state, reward, done = env.step(ac_v, ac_a) # print('tra_reward', reward) # print(np.shape(state_record), next_state[0].shape) state_record.append(next_state) local_steps += 1 total_numsteps += 1 episode_steps += 1 episode_reward += reward # print('sl-mem',state.shape,ac_v.shape) # print('sl state mem', state.shape, ac_a.shape) memory_SL_vehicle.append(state_record[-1], ac_v) memory_SL_attacker.append(state_record[-1], ac_a) action_vehicle = torch.Tensor(action_vehicle) action_attacker = 
torch.Tensor(action_attacker) mask = torch.Tensor([not done]) prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1) next_state = torch.Tensor([next_state]) # print(prev_state.shape, next_state.shape) reward_vehicle = torch.Tensor([reward]) reward_attacker = torch.Tensor([env.RC - reward]) # print(state_record[-20:]) # print(torch.Tensor([state_record[-20:]]).shape) memory_vehicle.push(prev_state, action_vehicle, mask, next_state, reward_vehicle) memory_attacker.push(prev_state, action_attacker, mask, next_state, reward_attacker) state = next_state.numpy()[0] # print(state_record[-1].shape) if done: rewards.append(episode_reward) if i_episode % 100: print('Episode {} ends, local_steps {}. total_steps {}, instant ave-reward is {:.4f}'.format( i_episode, local_steps, total_numsteps, episode_reward)) break if len(memory_vehicle) > args.batch_size: # 开始训练 # print('begin training') for _ in range(args.updates_per_step): transitions_vehicle = memory_vehicle.sample(args.batch_size) batch_vehicle = Transition(*zip(*transitions_vehicle)) transitions_attacker = memory_attacker.sample(args.batch_size) batch_attacker = Transition(*zip(*transitions_attacker)) # print(batch_vehicle) trans_veh = memory_SL_vehicle.sample(args.batch_size) trans_att = memory_SL_attacker.sample(args.batch_size) states_veh = [] actions_veh = [] states_att = [] actions_att = [] for sample in trans_veh: state_veh, act_veh = sample states_veh.append(state_veh) actions_veh.append(act_veh) for sample in trans_att: state_att, act_att = sample states_att.append(state_att) actions_att.append(act_att) states_veh = np.reshape(states_veh, (-1, env.observation_space)) states_att = np.reshape(states_att, (-1, env.observation_space)) actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space)) actions_att = np.reshape(actions_att, (-1, env.attacker_action_space)) policy_vehicle.fit(states_veh, actions_veh, verbose=False) policy_attacker.fit(states_att, actions_att, verbose=False) value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle) value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker) # writer.add_scalar('loss/value', value_loss, updates) # writer.add_scalar('loss/policy', policy_loss, updates) updates += 1 if i_episode % 10 == 0 and i_episode > 0: state = env.reset() state_record = [np.array([state])] while len(state_record) < 20: a, b = env.random_action() s, _, _ = env.step(np.array([a]), np.array([b])) local_steps += 1 state_record.append(s) evaluate_reward = 0 while True: # la = np.random.randint(0, len(state_record) - 20, 1)[0] if random.random() < ETA: action_vehicle = agent_vehicle.select_action(torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :] action_attacker = agent_attacker.select_action(torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :] else: action_vehicle = torch.Tensor([policy_vehicle.predict( state_record[-1].reshape(-1, 4)) / policy_vehicle.predict( state_record[-1].reshape(-1, 4)).sum()])[0] action_attacker = torch.Tensor([policy_attacker.predict( state_record[-1].reshape(-1, 4)) / policy_attacker.predict( state_record[-1].reshape(-1, 4)).sum()])[0] ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy() next_state, reward, done = env.step(ac_v, ac_a) real_ac_v = ac_v[0].clip(-1, 1) + 1 tra_ac_veh.append(real_ac_v / (sum(real_ac_v) + 0.0000001)) tra_ac_att.append(ac_a[0]) state_record.append(next_state) total_numsteps += 1 local_steps += 1 # print('eva_reward', 
reward) evaluate_reward += reward state = next_state[0] if done: average_reward = np.mean(rewards[-10:]) print("{} % Episode finished, total numsteps: {}, eva-reward: {}, average reward: {}".format( i_episode / args.num_episodes * 100, total_numsteps, evaluate_reward, average_reward)) eva_reward.append(evaluate_reward) ave_reward.append(average_reward) # print(ac_v[0]) break # writer.add_scalar('reward/test', episode_reward, i_episode) env.close() df = pd.DataFrame() df['Eva'] = pd.Series(eva_reward) df['Tra'] = pd.Series(ave_reward) df2 = pd.DataFrame() df2['Weight'] = pd.Series(tra_ac_veh) df2['Attack'] = pd.Series(tra_ac_att) df.to_csv('./Result/reward_result_30.csv', index=False) df2.to_csv('./Result/action_result_30.csv', index=False) # np.savetxt('./Result/eva_result.csv', eva_reward, delimiter=',') # np.savetxt('./Result/ave_result.csv', ave_reward, delimiter=',') f = plt.figure() plt.plot(rewards[5:], label='Eva_reward') plt.show() AC_veh = np.array(tra_ac_veh) AC_att = np.array(tra_ac_att) # print(AC_veh.shape) # print(AC_veh) plt.plot(AC_veh[:, 0], label='Beacon1', alpha=0.2) plt.plot(AC_veh[:, 1], label='Beacon2', alpha=0.2) plt.plot(AC_veh[:, 2], label='Beacon3', alpha=0.2) plt.plot(AC_veh[:, 3], label='Beacon4', alpha=0.2) # plt.plot(ave_reward, label='Tra_ave_reward') plt.legend() plt.savefig('./Result/Veh_result_30.png', dpi=300) plt.show() # print(AC_veh.shape) # print(AC_veh) plt.plot(AC_att[:, 0], label='Attack1', alpha=0.2) plt.plot(AC_att[:, 1], label='Attack2', alpha=0.2) plt.plot(AC_att[:, 2], label='Attack3', alpha=0.2) plt.plot(AC_att[:, 3], label='Attack4', alpha=0.2) # plt.plot(ave_reward, label='Tra_ave_reward') # plt.title('') plt.legend() plt.savefig('./Result/Att_result_30.png', dpi=300) plt.show()
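# NOTE: OUNoise is used throughout this file but never defined in it. Below is
# a minimal sketch consistent with how fit_nash() and the other loops call it
# (constructed from an action dimension, annealed via .scale, stepped with
# .noise(), re-centered with .reset()) -- an assumption, not the original class:
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""
    def __init__(self, action_dimension, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = 1.0  # annealed externally, as in the training loops above
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dimension) * self.mu

    def reset(self):
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state * self.scale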
class Agent(object): """ The Agent interacts with and learns from the environment. """ def __init__(self, state_size, action_size, num_agents, random_seed=0, params=params): """ Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.params = params # Actor (Policy) Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.params['DEVICE']) self.actor_target = Actor(state_size, action_size, random_seed).to(self.params['DEVICE']) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.params['LR_ACTOR']) # Critic (Value) Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.params['DEVICE']) self.critic_target = Critic(state_size, action_size, random_seed).to(self.params['DEVICE']) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.params['LR_CRITIC'], weight_decay=self.params['WEIGHT_DECAY']) # Initialize target and local to same weights self.hard_update(self.actor_local, self.actor_target) self.hard_update(self.critic_local, self.critic_target) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'], self.params['BATCH_SIZE'], random_seed) def hard_update(self, local_model, target_model): """ Hard update model parameters. """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) def step(self, states, actions, rewards, next_states, dones): """ Save experiences in replay memory and use random sample from buffer to learn. """ # Save experience / reward, catering for multiple agents per step for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn if enough samples are available in memory if len(self.memory) > self.params['BATCH_SIZE']: experiences = self.memory.sample() self.learn(experiences, self.params['GAMMA']) def act(self, states, add_noise=True): """ Returns actions for a given state as per current policy. """ states = torch.from_numpy(states).float().to(self.params['DEVICE']) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): actions[i, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma=params['GAMMA']): """Update policy and value parameters using given batch of experience tuples.
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Update Critic (Value) # Get predicted next-state actions and Q-Values from target Network actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q Target for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_( self.critic_local.parameters(), 1) # Stabilize learning per benchmark guidelines self.critic_optimizer.step() # Update Actor (Policy) # Compute Actor Loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update target networks self.soft_update(self.critic_local, self.critic_target, tau=self.params['TAU']) self.soft_update(self.actor_local, self.actor_target, tau=self.params['TAU']) def soft_update(self, local_model, target_model, tau=params['TAU']): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
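# A standalone sanity check of the hard_update/soft_update semantics used above
# (hypothetical toy layers; the tau value is illustrative):
import torch
import torch.nn as nn

local, target = nn.Linear(2, 2), nn.Linear(2, 2)
# hard update: target <- local, so both start identical
for t_p, l_p in zip(target.parameters(), local.parameters()):
    t_p.data.copy_(l_p.data)
# perturb local so the interpolation below is visible
with torch.no_grad():
    for p in local.parameters():
        p.add_(1.0)
# soft update: target <- tau*local + (1 - tau)*target
tau = 0.001
for t_p, l_p in zip(target.parameters(), local.parameters()):
    t_p.data.copy_(tau * l_p.data + (1.0 - tau) * t_p.data)
# each target weight has now moved 0.1% of the way toward its local counterpart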
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.session = K.get_session() init = tf.global_variables_initializer() self.session.run(init) self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high self.score = -math.inf self.best_score = -math.inf self.last_loss = math.inf # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0 self.exploration_theta = 0.15 self.exploration_sigma = 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) self.noise_scale = (self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 16 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset_episode(self): self.noise.reset() self.total_reward = 0 state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) self.total_reward += reward # Learn, if enough samples are available in memory print("Memory Size: {}, Batch Size: {}".format(len(self.memory), self.batch_size)) if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" #state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(np.array([state]))[0] return list(action + self.noise.sample()) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" print("Fitting model iteration ...") # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
states = np.array([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.array( [e.next_state for e in experiences if e is not None]) print("Next states shape: {}".format(next_states.shape)) self.score = rewards.mean() self.best_score = max(self.score, self.best_score) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) r = self.actor_local.train_fn([states, action_gradients, 1]) # custom training function self.last_loss = np.mean(-action_gradients * actions) # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights) if __name__ == "__main__": state_size = (84, 296, 9) action_low = np.array([1, 0, 1]) action_high = np.array([10, 359, 2000]) net = Actor(state_size, 3, action_low, action_high) #net = Critic(state_size, 3) net.model.summary()
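# ReplayBuffer is not defined in this file; learn() above reads samples as
# e.state / e.action / ..., which implies namedtuple experiences. A minimal
# sketch under that assumption:
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # old experiences fall off the back
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)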
class ddpg_agent: def __init__(self, args, env): self.args = args self.env = env # get the number of inputs... num_inputs = self.env.observation_space.shape[0] num_actions = self.env.action_space.shape[0] self.action_scale = self.env.action_space.high[0] # build up the network self.actor_net = Actor(num_inputs, num_actions) self.critic_net = Critic(num_inputs, num_actions) # get the target network... self.actor_target_net = Actor(num_inputs, num_actions) self.critic_target_net = Critic(num_inputs, num_actions) if self.args.cuda: self.actor_net.cuda() self.critic_net.cuda() self.actor_target_net.cuda() self.critic_target_net.cuda() # copy the parameters.. self.actor_target_net.load_state_dict(self.actor_net.state_dict()) self.critic_target_net.load_state_dict(self.critic_net.state_dict()) # setup the optimizer... self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), lr=self.args.actor_lr) self.optimizer_critic = torch.optim.Adam( self.critic_net.parameters(), lr=self.args.critic_lr, weight_decay=self.args.critic_l2_reg) # setting up the noise self.ou_noise = OUNoise(num_actions) # check some dir if not os.path.exists(self.args.save_dir): os.mkdir(self.args.save_dir) self.model_path = self.args.save_dir + self.args.env_name + '/' if not os.path.exists(self.model_path): os.mkdir(self.model_path) # start to train the network.. def learn(self): # init the brain memory replay_buffer = [] total_timesteps = 0 running_reward = None for episode_idx in range(self.args.max_episode): state = self.env.reset() # get the scale of the ou noise... self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * max(0, self.args.exploration_length - episode_idx) / \ self.args.exploration_length + self.args.final_noise_scale self.ou_noise.reset() # start the training reward_total = 0 while True: state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0) if self.args.cuda: state_tensor = state_tensor.cuda() with torch.no_grad(): policy = self.actor_net(state_tensor) # start to select the actions... actions = self._select_actions(policy) # step state_, reward, done, _ = self.env.step(actions * self.action_scale) total_timesteps += 1 reward_total += reward # start to store the samples... 
replay_buffer.append((state, reward, actions, done, state_)) # check if the buffer size is out of range if len(replay_buffer) > self.args.replay_size: replay_buffer.pop(0) if len(replay_buffer) > self.args.batch_size: mini_batch = random.sample(replay_buffer, self.args.batch_size) # start to update the network _, _ = self._update_network(mini_batch) if done: break state = state_ running_reward = reward_total if running_reward is None else running_reward * 0.99 + reward_total * 0.01 if episode_idx % self.args.display_interval == 0: torch.save(self.actor_net.state_dict(), self.model_path + 'model.pt') print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format( datetime.now(), episode_idx, total_timesteps, running_reward)) self.env.close() # select actions def _select_actions(self, policy): actions = policy.detach().cpu().numpy()[0] actions = actions + self.ou_noise.noise() actions = np.clip(actions, -1, 1) return actions # update the network def _update_network(self, mini_batch): state_batch = np.array([element[0] for element in mini_batch]) state_batch = torch.tensor(state_batch, dtype=torch.float32) # reward batch reward_batch = np.array([element[1] for element in mini_batch]) reward_batch = torch.tensor(reward_batch, dtype=torch.float32).unsqueeze(1) # done batch done_batch = np.array([int(element[3]) for element in mini_batch]) done_batch = 1 - done_batch done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1) # action batch actions_batch = np.array([element[2] for element in mini_batch]) actions_batch = torch.tensor(actions_batch, dtype=torch.float32) # next state state_next_batch = np.array([element[4] for element in mini_batch]) state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32) # move tensors to the GPU if CUDA is enabled if self.args.cuda: state_batch = state_batch.cuda() reward_batch = reward_batch.cuda() done_batch = done_batch.cuda() actions_batch = actions_batch.cuda() state_next_batch = state_next_batch.cuda() # update the critic network... with torch.no_grad(): actions_out = self.actor_target_net(state_next_batch) expected_q_value = self.critic_target_net(state_next_batch, actions_out) # get the target value target_value = reward_batch + self.args.gamma * expected_q_value * done_batch target_value = target_value.detach() values = self.critic_net(state_batch, actions_batch) critic_loss = (target_value - values).pow(2).mean() self.optimizer_critic.zero_grad() critic_loss.backward() self.optimizer_critic.step() # start to update the actor network actor_loss = -self.critic_net(state_batch, self.actor_net(state_batch)).mean() self.optimizer_actor.zero_grad() actor_loss.backward() self.optimizer_actor.step() # then, start to soft-update the target networks... self._soft_update_target_network(self.critic_target_net, self.critic_net) self._soft_update_target_network(self.actor_target_net, self.actor_net) return actor_loss.item(), critic_loss.item() # soft update the network def _soft_update_target_network(self, target, source): # Polyak-average the source parameters into the target network
for target_param, param in zip(target.parameters(), source.parameters()): target_param.data.copy_(self.args.tau * param.data + (1 - self.args.tau) * target_param.data) # functions to test the network def test_network(self): model_path = self.args.save_dir + self.args.env_name + '/model.pt' self.actor_net.load_state_dict( torch.load(model_path, map_location=lambda storage, loc: storage)) self.actor_net.eval() # start to test for _ in range(5): state = self.env.reset() reward_sum = 0 while True: self.env.render() state = torch.tensor(state, dtype=torch.float32).unsqueeze(0) with torch.no_grad(): actions = self.actor_net(state) actions = actions.detach().numpy()[0] state_, reward, done, _ = self.env.step(self.action_scale * actions) reward_sum += reward if done: break state = state_ print('The reward of this episode is {}.'.format(reward_sum)) self.env.close()
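# The OU-noise annealing used in learn() above (and in several of the other
# training loops in this file) is a linear decay from noise_scale down to
# final_noise_scale over the first exploration episodes. A quick check with
# illustrative values (the constants here are assumptions, not repo defaults):
noise_scale, final_noise_scale, exploration_length = 0.3, 0.05, 100

def ou_scale(episode_idx):
    return (noise_scale - final_noise_scale) * \
        max(0, exploration_length - episode_idx) / exploration_length + final_noise_scale

for ep in (0, 50, 100, 200):
    print(ep, round(ou_scale(ep), 3))  # -> 0.3, 0.175, 0.05, 0.05 (floor reached)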
def main(): sess = tf.Session() K.set_session(sess) env = gym.make("MountainCarContinuous-v0") #Parameters memory_size = 100000 batch_size = 32 tau = 0.001 lr_actor = 0.0001 lr_critic = 0.001 discount_factor = 0.99 episodes = 1001 time_steps = 501 collect_experience = 50000 save_frequency = 250 ep_reward = [] training = False #Noise object noise = OUNoise(env.action_space) #Initialize actor and critic objects actor = Actor(env, sess, lr_actor, tau) #Uncomment the following line to save the actor model architecture as a json file. Needs to be saved #once only # actor.save_model_architecture("Actor_model_architecture.json") critic = Critic(env, sess, lr_critic, tau, discount_factor) #Initialize replay memory of size defined by memory_size replay_memory = ReplayMemory(memory_size) #Toggle between true and false for debugging purposes. For training it is always true run = True if run: #Loop over the number of episodes. At each new episode reset the environment, reset the noise #state and set total episode reward to 0 for episode in range(episodes): state = env.reset() noise.reset() episode_reward = 0 #Loop over the number of steps in an episode for time in range(time_steps): #Uncomment the following line if you want to visualize the mountain car during training. #Can also be trained without visualization for the case where we are using #position and velocities as state variables. # env.render() #Predict an action from the actor model using the current state action = actor.predict_action(state.reshape((1, 2)))[0] #Add Ornstein-Uhlenbeck noise to the predicted action to encourage exploration of the environment exploratory_action = noise.get_action(action, time) #Take the noisy action to enter the next state next_state, reward, done, _ = env.step(exploratory_action) #Predict the action to be taken given the next_state. This next state action is predicted #using the actor's target model next_action = actor.predict_next_action( next_state.reshape((1, 2)))[0] #Append this experience sample to the replay memory replay_memory.append(state, exploratory_action, reward, next_state, next_action, done) #Only start training when there are a minimum number of experience samples available in #memory if replay_memory.count() == collect_experience: training = True print('Start training') #When training: if training: # 1)first draw a random batch of samples from the replay memory batch = replay_memory.sample(batch_size) # 2) using this sample calculate dQ/dA from the critic model grads = critic.calc_grads(batch) # 3) calculate dA/dTheta from the actor using the same batch # 4) multiply dA/dTheta by negative dQ/dA to get dJ/dTheta # 5) Update actor weights such that dJ/dTheta is maximized # 6) The above operation is easily performed by minimizing the value obtained in (4) t_grads = actor.train(batch, grads) # update critic weights by minimizing the Bellman loss.
# Use actor target to compute the next action in the next state (already computed and stored in replay memory) # in order to compute the TD target critic.train(batch) #After each weight update of the actor and critic online model perform soft updates # of their targets so that they can smoothly and slowly track the online model's #weights actor.update_target() critic.update_target() #Add each step reward to the episode reward episode_reward += reward #Set current state as next state state = next_state #If target reached before the max allowed time steps, break the inner for loop if done: break #Store episode reward ep_reward.append([episode, episode_reward]) #Print info for each episode to track training progress print( "Completed in {} steps.... episode: {}/{}, episode reward: {} " .format(time, episode, episodes, episode_reward)) #Save model's weights and episode rewards after each save_frequency episode if training and (episode % save_frequency) == 0: print('Data saved at episode:', episode) actor.save_weights( './Model/DDPG_actor_model_{}.h5'.format(episode)) pickle.dump( ep_reward, open('./Rewards/rewards_{}.dump'.format(episode), 'wb')) # Close the mountain car environment env.close()
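# This main() constructs OUNoise from the action space and queries it with
# noise.get_action(action, time) -- a different interface from the OUNoise
# sketch earlier in this file. A sketch of that variant, assuming the common
# time-decayed-sigma formulation (class and parameter names are assumptions):
import numpy as np

class OUNoiseActionSpace:
    def __init__(self, action_space, mu=0.0, theta=0.15,
                 max_sigma=0.3, min_sigma=0.3, decay_period=100000):
        self.mu, self.theta = mu, theta
        self.sigma, self.max_sigma, self.min_sigma = max_sigma, max_sigma, min_sigma
        self.decay_period = decay_period
        self.action_dim = action_space.shape[0]
        self.low, self.high = action_space.low, action_space.high
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def evolve_state(self):
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state

    def get_action(self, action, t=0):
        # anneal sigma from max_sigma to min_sigma over decay_period steps
        self.sigma = self.max_sigma - (self.max_sigma - self.min_sigma) \
            * min(1.0, t / self.decay_period)
        return np.clip(action + self.evolve_state(), self.low, self.high)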
class Agent(): def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_high = task.action_high self.action_low = task.action_low # actor policy model self.actor_local = Actor(self.state_size, self.action_size, self.action_high, self.action_low) self.actor_target = Actor(self.state_size, self.action_size, self.action_high, self.action_low) # critic value model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # initialize target model parameters with local model parameters self.actor_target.model.set_weights( self.actor_local.model.get_weights()) self.critic_target.model.set_weights( self.critic_local.model.get_weights()) # noise process self.exploration_mu = 0 self.exploration_theta = 0.25 self.exploration_sigma = 0.3 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # replay buffer self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # algorithm parameters self.gamma = 0.9 # discount rate self.tau = 0.1 # soft update parameter self.total_reward = 0 self.count = 0 self.score = 0 self.best_score = -np.inf self.reset_episode() def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # keep track of rewards self.total_reward += reward self.count += 1 # save experience/reward self.memory.add(self.last_state, action, reward, next_state, done) # if there are enough experiences, learn from them if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state def act(self, states): # returns action for a given state(s) as per the current policy state = np.reshape(states, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) def learn(self, experiences): self.score = self.total_reward / float( self.count) if self.count else 0.0 # update the policy and value parameters given batch of experience tuples states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # get predicted next state and Q values from target models next_actions = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, next_actions]) # compute Q targets for current state and train local critic model Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # train local actor model action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom train function # soft update target models self.soft_update(self.actor_local.model, self.actor_target.model) self.soft_update(self.critic_local.model, self.critic_target.model) def soft_update(self, local_model, target_model): local_weights = 
np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
def main(): agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space, env.vehicle_action_space) agent_attacker = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space, env.attacker_action_space) vehicle_memory = ReplayMemory(1000000) attacker_memory = ReplayMemory(1000000) vehicle_ounoise = OUNoise(env.vehicle_action_space) if args.ou_noise else None attacker_ounoise = OUNoise(env.attacker_action_space) if args.ou_noise else None param_noise_vehicle = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None param_noise_attacker = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None rewards = [] total_numsteps = 0 updates = 0 for i_episode in range(args.num_episodes): state = torch.Tensor([[env.reset()]]) # 4-dimensional velocity observation if args.ou_noise: vehicle_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale vehicle_ounoise.reset() attacker_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale attacker_ounoise.reset() episode_reward = 0 t = 0 # steps taken this episode; used below to slice this episode's transitions (was previously undefined) while True: action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle) action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker) next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0]) total_numsteps += 1 t += 1 episode_reward += reward action_vehicle = torch.Tensor(action_vehicle) action_attacker = torch.Tensor(action_attacker) mask = torch.Tensor([not done]) next_state = torch.Tensor([next_state]) reward_vehicle = torch.Tensor([-reward]) reward_attacker = torch.Tensor([env.RC+reward]) vehicle_memory.push(state, action_vehicle, mask, next_state, reward_vehicle) attacker_memory.push(state, action_attacker, mask, next_state, reward_attacker) state = next_state if len(vehicle_memory) > args.batch_size: for _ in range(args.updates_per_step): transitions_vehicle = vehicle_memory.sample(args.batch_size) batch_vehicle = Transition(*zip(*transitions_vehicle)) transition_attacker = attacker_memory.sample(args.batch_size) batch_attacker = Transition(*zip(*transition_attacker)) value_loss_1, policy_loss_1 = agent_vehicle.update_parameters(batch_vehicle) value_loss_2, policy_loss_2 = agent_attacker.update_parameters(batch_attacker) # writer.add_scalar('loss/value', value_loss, updates) # writer.add_scalar('loss/policy', policy_loss, updates) updates += 1 if done: break # writer.add_scalar('reward/train', episode_reward, i_episode) # Update param_noise based on distance metric if args.param_noise: episode_transitions_vehicle = vehicle_memory.memory[vehicle_memory.position - t:vehicle_memory.position] states_vehicle = torch.cat([transition[0] for transition in episode_transitions_vehicle], 0) unperturbed_actions_vehicle = agent_vehicle.select_action(states_vehicle, None, None) perturbed_actions_vehicle = torch.cat([transition[1] for transition in episode_transitions_vehicle], 0) ddpg_dist_vehicle = ddpg_distance_metric(perturbed_actions_vehicle.numpy(), unperturbed_actions_vehicle.numpy()) param_noise_vehicle.adapt(ddpg_dist_vehicle) episode_transitions_attacker = attacker_memory.memory[attacker_memory.position - t:attacker_memory.position] states_attacker =
torch.cat([transition[0] for transition in episode_transitions_attacker], 0) unperturbed_actions_attacker = agent_attacker.select_action(states_attacker, None, None) perturbed_actions_attacker = torch.cat([transition[1] for transition in episode_transitions_attacker], 0) ddpg_dist_attacker = ddpg_distance_metric(perturbed_actions_attacker.numpy(), unperturbed_actions_attacker.numpy()) param_noise_attacker.adapt(ddpg_dist_attacker) rewards.append(episode_reward) if i_episode % 10 == 0: state = torch.Tensor([[env.reset()]]) episode_reward = 0 while True: action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle) action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker) next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0]) episode_reward += reward next_state = torch.Tensor([[next_state]]) state = next_state if done: break # writer.add_scalar('reward/test', episode_reward, i_episode) rewards.append(episode_reward) print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:]))) env.close()
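# ddpg_distance_metric, used above to adapt the parameter noise, is not defined
# in this file. A sketch matching the pytorch-ddpg-naf utility it appears to
# come from: the RMS difference between perturbed and unperturbed actions.
import numpy as np

def ddpg_distance_metric(actions1, actions2):
    diff = actions1 - actions2
    mean_diff = np.mean(np.square(diff), axis=0)
    return float(np.sqrt(np.mean(mean_diff)))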
class DDPG(object): """Interacts with and learns from the environment. There are two agents and each agent's observation has 24 dimensions. Each agent's action has 2 dimensions. Will use two separate actor networks (one for each agent, using each agent's observations only and outputting that agent's action). The critic for each agent gets to see the actions and observations of all agents. """ def __init__(self, state_size, action_size, num_agents): """Initialize an Agent object. Params ====== state_size (int): dimension of each state for each agent action_size (int): dimension of each action for each agent """ self.state_size = state_size self.action_size = action_size # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size).to(DEVICE) self.actor_target = Actor(state_size, action_size).to(DEVICE) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR, weight_decay=WEIGHT_DECAY_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(num_agents * state_size, num_agents * action_size).to(DEVICE) self.critic_target = Critic(num_agents * state_size, num_agents * action_size).to(DEVICE) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY_critic) # Noise process self.noise = OUNoise(action_size) #single agent only self.noise_scale = NOISE_START # Make sure target is initialized with the same weight as the source (makes a big difference) self.hard_update(self.actor_target, self.actor_local) self.hard_update(self.critic_target, self.critic_local) def act(self, states, i_episode, add_noise=True): """Returns actions for given state as per current policy.""" if i_episode > EPISODES_BEFORE_TRAINING and self.noise_scale > NOISE_END: #self.noise_scale *= NOISE_REDUCTION self.noise_scale = NOISE_REDUCTION**(i_episode - EPISODES_BEFORE_TRAINING) #else keep the previous value if not add_noise: self.noise_scale = 0.0 states = torch.from_numpy(states).float().to(DEVICE) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() #add noise actions += self.noise_scale * self.add_noise2( ) #works much better than OU Noise process #actions += self.noise_scale*self.noise.sample() return np.clip(actions, -1, 1) def add_noise2(self): noise = 0.5 * np.random.randn( 1, self.action_size ) #sigma of 0.5 as sigma of 1 will have a lot of actions just clipped return noise def reset(self): self.noise.reset() def learn(self, experiences, gamma): #for MADDPG """Update policy and value parameters using given batch of experience tuples.
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ full_states, actor_full_actions, full_actions, agent_rewards, agent_dones, full_next_states, critic_full_next_actions = experiences # ---------------------------- update critic ---------------------------- # # Get Q values from target models Q_target_next = self.critic_target(full_next_states, critic_full_next_actions) # Compute Q targets for current states (y_i) Q_target = agent_rewards + gamma * Q_target_next * (1 - agent_dones) # Compute critic loss Q_expected = self.critic_local(full_states, full_actions) critic_loss = F.mse_loss( input=Q_expected, target=Q_target ) #target=Q_targets.detach() #not necessary to detach # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() #torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1.0) #clip the gradient for the critic network (Udacity hint) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actor_loss = -self.critic_local.forward( full_states, actor_full_actions).mean( ) # negative because we want to do gradient ascent # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() def soft_update_all(self): # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def hard_update(self, target, source): for target_param, source_param in zip(target.parameters(), source.parameters()): target_param.data.copy_(source_param.data)
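# A tiny numeric check of the target in the docstring above,
# Q_targets = r + gamma * Q_targets_next * (1 - dones): terminal transitions
# (done == 1) contribute no bootstrapped value.
import torch

rewards = torch.tensor([[1.0], [0.5]])
Q_targets_next = torch.tensor([[2.0], [4.0]])
dones = torch.tensor([[0.0], [1.0]])  # second transition is terminal
gamma = 0.99
Q_targets = rewards + gamma * Q_targets_next * (1 - dones)
print(Q_targets)  # tensor([[2.9800], [0.5000]])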
class DDPG: def __init__(self, task): # Hyper parameters self.learning_rate_actor = 1e-4 self.learning_rate_critic = 1e-3 self.gamma = 0.99 self.tau = 0.001 # Define net self.sess = tf.Session() self.task = task self.actor = ActorNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_actor, \ self.task.action_low, self.task.action_high, self.tau) self.critic = CriticNet(self.sess, self.task.state_size, self.task.action_size, self.learning_rate_critic, self.tau) # Define noise self.mu = 0 self.theta = 0.15 self.sigma = 0.20 self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma) # Define memory replay self.buffer_size = 1000000 self.batch_size = 64 self.memory = Replay(self.buffer_size, self.batch_size) # Score self.best_score = -np.inf self.best_reward = -np.inf def reset(self): self.noise.reset() state = self.task.reset() self.last_state = state self.total_reward = 0.0 self.count = 0 return state def learn(self, experience): # Turn into different np arrays state_batch = np.vstack([e[0] for e in experience]) action_batch = np.vstack([e[1] for e in experience]) reward_batch = np.vstack([e[2] for e in experience]) next_state_batch = np.vstack([e[3] for e in experience]) done_batch = np.vstack([e[4] for e in experience]) # Calculate next_state q value next_action_batch = self.actor.target_actions(next_state_batch) next_q_targets = self.critic.targetQ(next_state_batch, next_action_batch) # Train critic net q_targets = reward_batch + self.gamma * next_q_targets * (1 - done_batch) self.critic.train(state_batch, action_batch, q_targets) # Train actor net action_gradients = self.critic.gradients(state_batch, action_batch) self.actor.train(action_gradients, state_batch) # Update target network self.actor.update_target(False) self.critic.update_target(False) def step(self, action, reward, next_state, done): self.memory.add([self.last_state, action, reward, next_state, done]) self.total_reward += reward self.count += 1 if done: self.score = self.total_reward / float(self.count) if self.count else 0.0 self.best_score = max(self.best_score, self.score) self.best_reward = max(self.total_reward, self.best_reward) if len(self.memory.buffer) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) self.last_state = next_state def act(self, states): states = np.reshape(states, [-1, self.task.state_size]) action = self.actor.actions(states)[0] return list(action + self.noise.sample())
class Agent: """Interacts with and learns from the environment.""" def __init__( self, num_agents, state_size, action_size, buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-3, weight_decay=0, random_seed=2, ): """Initialize an Agent object. Params ====== num_agents (int): number of agents state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.num_agents = num_agents self.state_size = state_size self.action_size = action_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer( action_size=action_size, buffer_size=buffer_size, batch_size=batch_size, seed=random_seed, ) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) # clip gradients at 1 self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
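# learn() above clips the critic's gradients with clip_grad_norm_(params, 1).
# A standalone check of what that does (toy network; numbers illustrative):
# clip_grad_norm_ returns the total norm *before* clipping and rescales the
# gradients in place so their global norm is at most the given maximum.
import torch
import torch.nn as nn

net = nn.Linear(4, 1)
loss = (net(torch.randn(8, 4)) ** 2).sum() * 100.0  # inflate the gradients
loss.backward()
pre_clip_norm = torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
post_clip_norm = torch.sqrt(sum(p.grad.norm() ** 2 for p in net.parameters()))
print(float(pre_clip_norm), float(post_clip_norm))  # large value, then <= 1.0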
class DDPGAgent: ''' DDPG Agent implementation ''' def __init__(self, agent_id, state_size, action_size, rand_seed, meta_agent): """ Creates a new DDPG Agent """ self.agent_id = agent_id self.action_size = action_size # Defines the Actor Networks self.actor_local = Actor(state_size, action_size, rand_seed).to(device) self.actor_target = Actor(state_size, action_size, rand_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Defines the Critic Networks self.critic_local = Critic(state_size, action_size, meta_agent.agents_qty, rand_seed).to(device) self.critic_target = Critic(state_size, action_size, meta_agent.agents_qty, rand_seed).to(device) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=LR_CRITIC) #, weight_decay=WEIGHT_DECAY) self.noise = OUNoise(action_size, rand_seed) # Refers to the MA agent memory self.memory = meta_agent.memory self.t_step = 0 def step(self): # Takes a step self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def learn(self, experiences, gamma): (states_list, actions_list, rewards, next_states_list, dones) = experiences # Get the target actions for all the states l_all_next_actions = [] for states in states_list: l_all_next_actions.append(self.actor_target(states)) # Convert the experiences into Torch tensors all_next_actions = torch.cat(l_all_next_actions, dim=1).to(device) all_next_states = torch.cat(next_states_list, dim=1).to(device) all_states = torch.cat(states_list, dim=1).to(device) all_actions = torch.cat(actions_list, dim=1).to(device) Q_targets_next = self.critic_target(all_next_states, all_next_actions) # Calculates the Q function using all the next states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(all_states, all_actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # --------------------------- update actor --------------------------- actions_pred = [] for states in states_list: actions_pred.append(self.actor_local(states)) actions_pred = torch.cat(actions_pred, dim=1).to(device) actor_loss = -self.critic_local(all_states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ---------------------- update target networks ---------------------- self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def act(self, states, add_noise=True): """ Returns the actions to take by the agent""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def soft_update(self, local_model, target_model, tau): """ Performs the soft update """ iter_params = zip(target_model.parameters(), local_model.parameters()) for target_param, local_param in iter_params: tensor_aux = tau * local_param.data + (1.0 - tau) * target_param.data target_param.data.copy_(tensor_aux)
def main(): parser = argparse.ArgumentParser(description='PyTorch X-job') parser.add_argument('--env_name', default="Pendulum-v0", help='name of the environment') parser.add_argument('--gamma', type=float, default=0.99, metavar='G', help='discount factor for reward (default: 0.99)') parser.add_argument('--tau', type=float, default=0.001, help='soft update coefficient for target networks (default: 0.001)') parser.add_argument('--ou_noise', type=bool, default=True) parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G', help='initial noise scale (default: 0.4)') parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G', help='final noise scale (default: 0.3)') parser.add_argument('--exploration_end', type=int, default=33, metavar='N', help='number of episodes with noise (default: 33)') parser.add_argument('--seed', type=int, default=4, metavar='N', help='random seed (default: 4)') parser.add_argument('--batch_size', type=int, default=200, metavar='N', help='batch size (default: 200)') parser.add_argument('--num_steps', type=int, default=100, metavar='N', help='max episode length (default: 100)') parser.add_argument('--num_episodes', type=int, default=5000, metavar='N', help='number of episodes (default: 5000)') parser.add_argument('--hidden_size', type=int, default=128, metavar='N', help='hidden size (default: 128)') parser.add_argument('--updates_per_step', type=int, default=5, metavar='N', help='model updates per simulator step (default: 5)') parser.add_argument('--replay_size', type=int, default=1000000, metavar='N', help='size of replay buffer (default: 1000000)') parser.add_argument('--save_agent', type=bool, default=True, help='save model to file') parser.add_argument('--train_model', type=bool, default=True, help='Training or run') parser.add_argument('--load_agent', type=bool, default=False, help='load model from file') parser.add_argument('--load_exp', type=bool, default=False, help='load saved experience') parser.add_argument('--greedy_steps', type=int, default=10, metavar='N', help='amount of times greedy goes (default: 10)') args = parser.parse_args() env = ManipulateEnv() #env = gym.make(args.env_name) writer = SummaryWriter('runs/') env.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) # -- initialize agent -- agent = NAF(args.gamma, args.tau, args.hidden_size, env.observation_space.shape[0], env.action_space) # -- declare memory buffer and random process N memory = ReplayMemory(args.replay_size) ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None # -- load existing model -- if args.load_agent: agent.load_model(args.env_name, args.batch_size, args.num_episodes, '.pth') print("agent: naf_{}_{}_{}_{}, is loaded".format( args.env_name, args.batch_size, args.num_episodes, '.pth')) # -- load experience buffer -- if args.load_exp: with open( '/home/quantao/Workspaces/catkin_ws/src/panda_demos/naf_env/src/exp_replay.pk1', 'rb') as input: memory.memory = pickle.load(input) memory.position = len(memory) rewards = [] total_numsteps = 0 updates = 0 #env.init_ros() #env.reset() t_start = time.time() for i_episode in range(args.num_episodes + 1): # -- reset environment for every episode -- #state = env.reset() state = torch.Tensor([env.reset()]) # -- initialize noise (random process N) -- if args.ou_noise: ounoise.scale = (args.noise_scale - args.final_noise_scale) * max( 0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise.reset() episode_reward = 0 while True: # -- action selection,
observation and store transition -- action = agent.select_action( state, ounoise) if args.train_model else agent.select_action(state) next_state, reward, done, info = env.step(action) #env.render() total_numsteps += 1 episode_reward += reward action = torch.Tensor(action) mask = torch.Tensor([not done]) reward = torch.Tensor([reward]) next_state = torch.Tensor([next_state]) #print('reward:', reward) memory.push(state, action, mask, next_state, reward) state = next_state #else: # time.sleep(0.005) #env.render() #time.sleep(0.005) #env.rate.sleep() if done or total_numsteps % args.num_steps == 0: break if len(memory) >= args.batch_size and args.train_model: env.reset() print("Training model") for _ in range(args.updates_per_step * args.num_steps): transitions = memory.sample(args.batch_size) batch = Transition(*zip(*transitions)) value_loss, policy_loss = agent.update_parameters(batch) writer.add_scalar('loss/value', value_loss, updates) writer.add_scalar('loss/policy', policy_loss, updates) updates += 1 writer.add_scalar('reward/train', episode_reward, i_episode) print("Train Episode: {}, total numsteps: {}, reward: {}".format( i_episode, total_numsteps, episode_reward)) rewards.append(episode_reward) greedy_numsteps = 0 if i_episode % 10 == 0: #state = env.reset() state = torch.Tensor([env.reset()]) episode_reward = 0 while True: action = agent.select_action(state) next_state, reward, done, info = env.step(action) episode_reward += reward greedy_numsteps += 1 #state = next_state state = torch.Tensor([next_state]) #env.render() #time.sleep(0.01) # env.rate.sleep() if done or greedy_numsteps % args.num_steps == 0: break writer.add_scalar('reward/test', episode_reward, i_episode) rewards.append(episode_reward) print( "Episode: {}, total numsteps: {}, reward: {}, average reward: {}" .format(i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:]))) #-- saves model -- if args.save_agent: agent.save_model(args.env_name, args.batch_size, args.num_episodes, '.pth') with open('exp_replay.pk1', 'wb') as output: pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL) print('Training ended after {} minutes'.format( (time.time() - t_start) / 60)) print('Time per episode: {} s'.format( (time.time() - t_start) / args.num_episodes)) print('Mean reward: {}'.format(np.mean(rewards))) print('Max reward: {}'.format(np.max(rewards))) print('Min reward: {}'.format(np.min(rewards)))
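# ReplayMemory and Transition, used by this main() (push(state, action, mask,
# next_state, reward), then Transition(*zip(*transitions)) to turn a list of
# sampled transitions into per-field batches), are not defined in this file.
# A minimal sketch in the style of the pytorch-ddpg-naf replay module:
import random
from collections import namedtuple

Transition = namedtuple("Transition",
                        ("state", "action", "mask", "next_state", "reward"))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0  # next slot to overwrite once the buffer is full

    def push(self, *args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)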
# Add expert data into replay buffer from expert_data import generate_Data replay_buffer = generate_Data(env, 300, "random", replay_buffer) # Evaluate untrained policy evaluations = [eval_policy(policy, args.env_name, args.seed)] state, done = env.reset(), False episode_reward = 0 episode_timesteps = 0 episode_num = 0 model_save_path = "kinova_gripper_learning{}.pt".format(args.model) noise = OUNoise(4) noise.reset() expl_noise = OUNoise(4, sigma=0.001) expl_noise.reset() for t in range(int(args.max_timesteps)): episode_timesteps += 1 # Select action randomly or according to policy if t < args.start_timesteps: # action = env.action_space.sample() # action = noise.noise().clip(-max_action, max_action) obs = torch.FloatTensor(np.array(state).reshape(1, -1)).to(device) action = pretrained_network(obs).cpu().data.numpy().flatten() else: # action = (
ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05, desired_action_stddev=args.noise_scale, adaptation_coefficient=1.05) if args.param_noise else None rewards = [] total_numsteps = 0 updates = 0 for i_episode in range(args.num_episodes): state = torch.Tensor([env.reset()]) if args.ou_noise: ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise.reset() if args.param_noise and args.algo == "DDPG": agent.perturb_actor_parameters(param_noise) episode_reward = 0 while True: action = agent.select_action(state, ounoise, param_noise) next_state, reward, done, _ = env.step(action.numpy()[0]) total_numsteps += 1 episode_reward += reward action = torch.Tensor(action) mask = torch.Tensor([not done]) next_state = torch.Tensor([next_state]) reward = torch.Tensor([reward])
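# AdaptiveParamNoiseSpec, constructed above, is not defined in this file. A
# sketch after the parameter-space-noise scheme of Plappert et al. (2018):
# grow the perturbation stddev while the induced action distance stays below
# the desired level, shrink it once it overshoots.
class AdaptiveParamNoiseSpec(object):
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient  # too noisy: back off
        else:
            self.current_stddev *= self.adaptation_coefficient  # too tame: push harder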
class Agent(): """ DDPG Agent, interacts with environment and learns from environment """ def __init__(self, device, state_size, n_agents, action_size, random_seed, \ buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay, \ learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'): # Set Computational device self.DEVICE = device # Init State, action and agent dimensions self.state_size = state_size self.n_agents = n_agents self.action_size = action_size self.seed = random.seed(random_seed) self.l_step = 0 self.log_interval = 200 # Init Hyperparameters self.BUFFER_SIZE = buffer_size self.BATCH_SIZE = batch_size self.GAMMA = gamma self.TAU = TAU self.LR_ACTOR = lr_actor self.LR_CRITIC = lr_critic self.WEIGHT_DECAY = weight_decay self.LEARN_INTERVAL = learn_interval self.LEARN_NUM = learn_num # Init Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Init Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Init Noise Process self.noise = OUNoise((n_agents, action_size), random_seed, mu=0., theta=ou_theta, sigma=ou_sigma) # Init Replay Memory self.memory = ReplayBuffer(device, action_size, buffer_size, batch_size, random_seed) # think def act(self, states, add_noise=True): """ Decide what action to take next """ # evaluate state through actor_local states = torch.from_numpy(states).float().to(self.DEVICE) actions = np.zeros((self.n_agents, self.action_size)) self.actor_local.eval() # put actor_local network in "evaluation" mode with torch.no_grad(): for n, state in enumerate(states): actions[n, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() # put actor_local back into "training" mode # add noise for better performance if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) # embody def step(self, t, s, a, r, s_, done): """ Commit step into the brain """ # Save SARS' to replay buffer --- state-action-reward-next_state tuple for n in range(self.n_agents): # self.memory.add(s, a, r, s_, done) # print ("going to learn 10 times") self.memory.add(s[n], a[n], r[n], s_[n], done[n]) if t % self.LEARN_INTERVAL != 0: return # Learn (if enough samples are available in memory ) if len(self.memory) > self.BATCH_SIZE: # print ("going to learn 10 times") for _ in range(self.LEARN_NUM): experiences = self.memory.sample() # get a memory sample self.learn(experiences, self.GAMMA) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Learn from experiences, with discount factor gamma Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ------ Update Critic ------ # # get predicted next-state actions and Q values from target networks actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ------ Update Actor ------ # # compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ------ Update Target Networks ------ # self.soft_update(self.critic_local, self.critic_target, self.TAU) self.soft_update(self.actor_local, self.actor_target, self.TAU) # keep count of steps taken # self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG():
    """Reinforcement learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (policy) model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (value) model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state
        self.last_state = next_state

    def act(self, state):
        """Return actions for the given state(s) as per the current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using a batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q-values from the target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train the (local) critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the (local) actor model
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update the target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        # Track the running score and adapt the exploration noise scale:
        # halve it after a new best score, otherwise double it (both bounded)
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"
        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
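# ReplayBuffer(buffer_size, batch_size) above is likewise external. A minimal
# deque-based sketch matching the add/sample/len usage in DDPG.step and
# DDPG.learn; the field names are inferred from e.state, e.action, ... in learn,
# so treat this as an assumed interface rather than the original class.
import random
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer of experience tuples (sketch, assumed interface)."""

    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences drop out when full
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", field_names=["state", "action", "reward", "next_state", "done"])

    def add(self, state, action, reward, next_state, done):
        """Append a new experience tuple."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self, batch_size=None):
        """Sample a random batch of experiences (defaults to self.batch_size)."""
        return random.sample(self.memory, k=batch_size or self.batch_size)

    def __len__(self):
        return len(self.memory)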
class DDPGAgent:
    def __init__(self, config, state_size, action_size):
        super(DDPGAgent, self).__init__()

        l1 = config['network']['hidden']
        l2 = int(config['network']['hidden'] / 2)

        self.actor = Actor(state_size, action_size, config['seed']['agent'], l1, l2).to(device)
        self.critic = Critic(state_size, action_size, config['seed']['agent'], l1, l2).to(device)
        self.target_actor = Actor(state_size, action_size, config['seed']['agent'], l1, l2).to(device)
        self.target_critic = Critic(state_size, action_size, config['seed']['agent'], l1, l2).to(device)

        self.noise = OUNoise(action_size,
                             mu=config['noise']['mu'],
                             sigma=config['noise']['sigma'],
                             theta=config['noise']['theta'])

        # Initialize targets the same as the original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=config['LR_ACTOR'])
        self.critic_optimizer = Adam(self.critic.parameters(), lr=config['LR_CRITIC'])

    def resetNoise(self):
        self.noise.reset()

    def act(self, obs, noise=0.0):
        action = self.actor(obs) + noise * self.noise.noise()
        action = np.clip(action.detach().numpy(), -1, 1)
        return action

    def target_act(self, obs, noise=0.0):
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action

    def learn(self, experiences, gamma, tau):
        """Update policy and value parameters using a batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.target_actor(next_states)
        Q_targets_next = self.target_critic(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        cl = critic_loss.cpu().detach().item()
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # From https://github.com/hortovanyi/DRLND-Continuous-Control/blob/master/ddpg_agent.py
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()
        al = actor_loss.cpu().detach().item()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.target_critic, tau)
        self.soft_update(self.actor, self.target_actor, tau)

        return [al, cl]

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    # https://github.com/ikostrikov/pytorch-ddpg-naf/blob/master/ddpg.py#L15
    def hard_update(self, target, source):
        """Copy network parameters from source to target.

        Inputs:
            target (torch.nn.Module): net to copy parameters to
            source (torch.nn.Module): net whose parameters to copy
        """
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
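# For reference, a configuration dictionary with the keys DDPGAgent reads
# (config['network']['hidden'], config['seed']['agent'], config['noise'][...],
# config['LR_ACTOR'], config['LR_CRITIC']). All values below are illustrative
# placeholders, not tuned settings; Actor, Critic and device come from this file.
config = {
    'network': {'hidden': 256},  # first hidden layer width; second layer is hidden // 2
    'seed': {'agent': 0},
    'noise': {'mu': 0.0, 'theta': 0.15, 'sigma': 0.2},
    'LR_ACTOR': 1e-4,
    'LR_CRITIC': 1e-3,
}

agent = DDPGAgent(config, state_size=33, action_size=4)  # example dimensions
# obs must be a torch.FloatTensor of shape (state_size,); noise scales exploration:
# actions = agent.act(obs, noise=1.0)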
In every generation, the population is evaluated, ranked, mutated,
and re-inserted into the population.
'''
evo.evaluate_pop()
evo.rank_pop_selection_mutation()
print("Evolutionary Fitness = " + str(evo.best_policy.fitness))

''' ############# The DDPG part ############# '''
state = torch.Tensor([env.reset()])  # algo line 6

# Anneal the OU noise scale linearly over the exploration episodes
ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
    0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
ounoise.reset()

episode_reward = 0
for t in range(args.num_steps):  # line 7
    # Forward pass through the actor network
    action = agent.select_action(state, ounoise)  # line 8
    next_state, reward, done, _ = env.step(action.numpy()[0])  # line 9
    episode_reward += reward

    action = torch.Tensor(action)
    mask = torch.Tensor([not done])
    next_state = torch.Tensor([next_state])
    reward = torch.Tensor([reward])

    # if i_episode % 10 == 0:
    #     env.render()
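# The linear annealing expression for ounoise.scale recurs throughout these
# training loops. A small helper (the name is ours) makes the schedule explicit;
# note that with the defaults used in these scripts (noise_scale ==
# final_noise_scale == 0.3) the schedule is constant.
def annealed_noise_scale(i_episode, noise_scale, final_noise_scale, exploration_end):
    """Anneal linearly from noise_scale to final_noise_scale over exploration_end
    episodes, then hold at final_noise_scale."""
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale

# e.g. noise_scale=0.3, final_noise_scale=0.05, exploration_end=100:
# episode 0 -> 0.30, episode 50 -> 0.175, episode >= 100 -> 0.05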
def main():
    cfg = ConfigParser()
    cfg.read('config.ini')
    IP = cfg.get('server', 'ip')
    PORT = cfg.getint('server', 'port')
    FILE = cfg.get('file', 'file')
    SIZE = cfg.getint('env', 'buffer_size')
    TIME = cfg.getfloat('env', 'time')
    EPISODE = cfg.getint('env', 'episode')

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001, metavar='G',
                        help='soft target-update coefficient (default: 0.001)')
    parser.add_argument('--noise_scale', type=float, default=0.3, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                        help='number of episodes with noise (default: 100)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='number of hidden units (default: 128)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--batch_size', type=int, default=64, metavar='N',
                        help='batch size (default: 64)')

    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((IP, PORT))
    fd = sock.fileno()
    my_env = env(fd=fd, buff_size=SIZE, time=TIME, k=8, l=0.01, n=0.03, p=0.05)
    mpsched.persist_state(fd)

    args = parser.parse_args()

    agent = NAF_CNN(args.gamma, args.tau, args.hidden_size,
                    my_env.observation_space.shape[0], my_env.action_space)
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(my_env.action_space.shape[0])

    rewards = []
    times = []
    for i_episode in range(EPISODE):
        if i_episode < 0.9 * EPISODE:  # training
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()
            state = my_env.reset()
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end \
                + args.final_noise_scale
            ounoise.reset()
            print(state)
            episode_reward = 0
            while True:
                state = torch.FloatTensor(state)
                action = agent.select_action(state, ounoise)
                next_state, reward, count, recv_buff_size, done = my_env.step(action)
                episode_reward += reward

                action = torch.FloatTensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.FloatTensor(next_state)
                reward = torch.FloatTensor([float(reward)])
                memory.push(state, action, mask, next_state, reward)

                state = next_state

                if len(memory) > args.batch_size * 5:
                    for _ in range(args.updates_per_step):
                        transitions = memory.sample(args.batch_size)
                        batch = Transition(*zip(*transitions))
                        agent.update_parameters(batch)
                if done:
                    break
            rewards.append(episode_reward)
            io.join()
        else:  # testing (no exploration noise, measure wall-clock time)
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()
            state = my_env.reset()
            episode_reward = 0
            start_time = time.time()
            while True:
                state = torch.FloatTensor(state)
                action = agent.select_action(state)
                # env.step returns the same 5-tuple here as in the training branch
                next_state, reward, count, recv_buff_size, done = my_env.step(action)
                episode_reward += reward
                state = next_state
                if done:
                    break
            rewards.append(episode_reward)
            times.append(str(time.time() - start_time) + "\n")
            io.join()

    fo = open("times.txt", "w")
    fo.writelines(times)
    fo.close()
    sock.close()
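# main() expects a config.ini next to the script. A sketch that generates one
# with the sections and keys read above ([server] ip/port, [file] file,
# [env] buffer_size/time/episode); every value below is a placeholder assumption.
from configparser import ConfigParser

cfg = ConfigParser()
cfg['server'] = {'ip': '127.0.0.1', 'port': '8888'}
cfg['file'] = {'file': 'payload.bin'}
cfg['env'] = {'buffer_size': '1024', 'time': '0.1', 'episode': '100'}

with open('config.ini', 'w') as f:
    cfg.write(f)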