writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                     desired_action_stddev=args.noise_scale,
                                     adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)
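# The same exploration-noise annealing expression recurs in every training loop
# below. A minimal standalone sketch of it (the helper name is illustrative,
# not part of the original code): the OU noise scale decays linearly from
# noise_scale to final_noise_scale over the first exploration_end episodes and
# then stays at final_noise_scale.
def annealed_noise_scale(i_episode, noise_scale, final_noise_scale, exploration_end):
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale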
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)
    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    eva_ac_veh = []
    eva_ac_att = []
    total_numsteps = 0
    updates = 0

    # while len(state_record) < 20:
    #     s, _, _ = env.step(env.random_action())
    #     state_record.append(s)

    for i_episode in range(args.num_episodes):
        state = env.reset()

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        while True:
            if random.random() < ETA:
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor([[state]]), ounoise_vehicle, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(
                    torch.Tensor([[state]]), ounoise_attacker, param_noise_attacker)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state.reshape(-1, 4)) /
                     policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state.reshape(-1, 4)) /
                     policy_attacker.predict(state.reshape(-1, 4)).sum()])

            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]

            next_state, reward, done = env.step(ac_v, ac_a)
            total_numsteps += 1
            episode_reward += reward

            memory_SL_vehicle.append(state, ac_v)
            memory_SL_attacker.append(state, ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])

            memory_vehicle.push(torch.Tensor([[state]]), action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(torch.Tensor([[state]]), action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0][0]

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                break

            if len(memory_vehicle) > args.batch_size:
                # Start training once the replay memory holds more than one batch.
                # print('begin training')
                for _ in range(args.updates_per_step):
                    transitions_vehicle = memory_vehicle.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = memory_attacker.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))

                    trans_veh = memory_SL_vehicle.sample(args.batch_size)
                    trans_att = memory_SL_attacker.sample(args.batch_size)

                    states_veh, actions_veh = [], []
                    states_att, actions_att = [], []
                    for sample in trans_veh:
                        state_veh, act_veh = sample
                        states_veh.append(state_veh)
                        actions_veh.append(act_veh)
                    for sample in trans_att:
                        state_att, act_att = sample
                        states_att.append(state_att)
                        actions_att.append(act_att)

                    states_veh = np.reshape(states_veh, (-1, env.observation_space))
                    states_att = np.reshape(states_att, (-1, env.observation_space))
                    actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                    actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                    policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                    policy_attacker.fit(states_att, actions_att, verbose=False)

                    value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

        if i_episode % 10 == 0:
            state = env.reset()
            evaluate_reward = 0
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor([[state]]), ounoise_vehicle, param_noise_vehicle)
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor([[state]]), ounoise_attacker, param_noise_attacker)
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state.reshape(-1, 4)) / policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state.reshape(-1, 4)) / policy_attacker.predict(state.reshape(-1, 4)).sum()])

                if is_cuda:
                    ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
                else:
                    ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]

                next_state, reward, done = env.step(ac_v, ac_a)
                total_numsteps += 1
                evaluate_reward += reward
                state = next_state[0]

                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100, total_numsteps, evaluate_reward, average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    print(ac_v[0])
                    eva_ac_veh.append((ac_v[0] + 1) / sum(ac_v[0] + 1))
                    eva_ac_att.append((ac_a[0] + 1) / sum(ac_a[0] + 1))
                    break

    # writer.add_scalar('reward/test', episode_reward, i_episode)
    env.close()

    f = plt.figure()
    plt.plot(eva_reward, label='Eva_reward')
    plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.show()

    AC_veh = np.array(eva_ac_veh)
    AC_att = np.array(eva_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1')
    plt.plot(AC_veh[:, 1], label='Bacon2')
    plt.plot(AC_veh[:, 2], label='Bacon3')
    plt.plot(AC_veh[:, 3], label='Bacon4')
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('Veh_result.png', dpi=300)
    plt.show()
class DDPGB(object):
    # x is the vector of real values
    # b is the code vector
    # C is the standard codebook matrix
    # action_output_num is the dimensionality of the code output
    # replay_size is the maximum length of the replay memory queue
    # new_b is the newly computed b
    # env is the environment that produces actions and observations
    # agent is the actual DDPG learner
    # The noise parameters are kept only so that the code paths requiring random exploration can be reached.
    def __init__(self, C, b, x, action_output_num, actor_size, replay_size=1000000,
                 ou_noise=True, param_noise=True, noise_scale=0.3, final_noise_scale=0.3):
        self.C = C
        self.b = b
        self.x = x
        self.hd = action_output_num
        self.actor_size = actor_size
        self.memory = ReplayMemory(replay_size)
        self.new_b = None
        self.env = None
        self.agent = None
        self.ou_noise = ou_noise
        self.noise_scale = noise_scale
        self.final_noise_scale = final_noise_scale
        self.ounoise = OUNoise(action_output_num) if ou_noise else None
        self.param_noise = AdaptiveParamNoiseSpec(
            initial_stddev=0.05, desired_action_stddev=noise_scale,
            adaptation_coefficient=1.05) if param_noise else None

    def update_B(self, c, b, x):
        self.C = c
        self.b = b
        self.x = x

    # coff is the candidate weight ratio inside the reward, e.g. [0.2, 0.8]
    def generate_B(self, coff, gamma, tau, hidden_size, num_inputs, actor_size,
                   num_episodes=60000, exploration_end=150, batch_size=512,
                   updates_per_step=5000):
        self.env = QuantizationEnv(self.C, self.b, self.x, self.hd, coff)
        self.agent = DDPG(gamma, tau, hidden_size, self.env.action_bin, num_inputs, actor_size)

        rewards = []
        total_numsteps = 0
        updates = 0
        max_trail = 10000
        best_bb = 10000

        # Search for the best solution over num_episodes episodes.
        for i_episode in range(num_episodes):
            state = torch.Tensor([self.env.reset()])

            if self.ou_noise:
                self.ounoise.scale = (self.noise_scale - self.final_noise_scale) * max(
                    0, exploration_end - i_episode) / exploration_end + self.final_noise_scale
                self.ounoise.reset()
            if self.param_noise:
                self.agent.perturb_actor_parameters(self.param_noise)

            episode_reward = 0
            continuous_neg = 0
            continuous_pos = 0
            temp_trail = 0
            control_bit = 0

            next_state = self.env.compute_Cbx(self.b)
            next_state = torch.Tensor([next_state])

            while True:
                # yyj
                if control_bit > 15:
                    control_bit = control_bit % 16
                state = next_state
                action = self.agent.select_action(state, self.ounoise, self.param_noise)
                next_state, reward, done, bb = self.env.step(action, control_bit, self.actor_size)
                # print(control_bit, next_state[0], reward, done, bb)
                control_bit = control_bit + 1
                total_numsteps += 1
                episode_reward += reward

                # bb is the c_v value
                if best_bb > bb:
                    best_bb = bb
                    self.new_b = action

                if reward > 0:
                    continuous_pos += 1
                    continuous_neg = 0
                    if continuous_pos > 10:
                        done = True
                if reward < 0:
                    continuous_neg += 1
                    continuous_pos = 0
                    if continuous_neg > 10:
                        done = True
                if temp_trail > max_trail:
                    done = True

                action = torch.Tensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.Tensor([next_state])
                reward = torch.Tensor([reward])

                self.memory.push(state, action, mask, next_state, reward)
                # state = next_state
                temp_trail += 1

                # Not entered until the memory holds more than batch_size samples.
                if len(self.memory) > batch_size:
                    for _ in range(updates_per_step):
                        transitions = self.memory.sample(1)
                        batch = Transition(*zip(*transitions))
                        # value_loss belongs to the critic network, policy_loss to the actor network.
                        value_loss, policy_loss = self.agent.update_parameters(batch)
                        print("epoch:", i_episode, "updates", updates,
                              "value_loss:", value_loss, " policy_loss:", policy_loss)
                        updates += 1
                if done:
                    break

            if self.param_noise:
                episode_transitions = self.memory.memory[self.memory.position - batch_size:self.memory.position]
                states = torch.cat([transition[0] for transition in episode_transitions], 0)
                unperturbed_actions = self.agent.select_action(states, None, None)
                perturbed_actions = torch.cat([transition[1] for transition in episode_transitions], 0)
                ddpg_dist = ddpg_distance_metric(perturbed_actions.numpy(), unperturbed_actions.numpy())
                self.param_noise.adapt(ddpg_dist)

            rewards.append(episode_reward)
            continuous_neg = 0
            continuous_pos = 0
            temp_trail = 0

            if i_episode % 10 == 0 and i_episode != 0:
                state = torch.Tensor([self.env.reset()])
                episode_reward = 0
                control_bit = 0
                while True:
                    action = self.agent.select_action(state)
                    next_state, reward, done, bb = self.env.step(action.numpy()[0], control_bit)
                    episode_reward += reward

                    if best_bb > bb:
                        best_bb = bb
                        self.new_b = action

                    if reward > 0:
                        continuous_pos += 1
                        continuous_neg = 0
                        if continuous_pos > 10:
                            done = True
                    if reward < 0:
                        continuous_neg += 1
                        continuous_pos = 0
                        if continuous_neg > 10:
                            done = True
                    if temp_trail > max_trail:
                        done = True

                    next_state = torch.Tensor([next_state])
                    state = next_state
                    temp_trail += 1
                    if done:
                        break

                rewards.append(episode_reward)
                print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                    i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

        return self.new_b
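# A minimal usage sketch for DDPGB under the signatures above. The array shapes,
# hyper-parameter values and the variable names below are illustrative
# assumptions, not values taken from the original code.
if __name__ == '__main__':
    C_demo = np.random.randn(16, 16)   # placeholder codebook matrix
    b_demo = np.random.randn(16)       # placeholder code vector
    x_demo = np.random.randn(16)       # placeholder value vector

    searcher = DDPGB(C_demo, b_demo, x_demo, action_output_num=16, actor_size=16)
    new_b = searcher.generate_B(coff=[0.2, 0.8], gamma=0.99, tau=0.001,
                                hidden_size=128, num_inputs=16, actor_size=16,
                                num_episodes=100)
    print('best code found:', new_b)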
def build_agent(OD, OD_tensor, args):
    '''Build model'''
    build_func = get_build_func(OD_tensor, args)
    nb_regions = OD.shape[-1]
    nb_actions = get_nb_actions(args.action_mode, nb_regions)

    def get_prob_imitation(steps):
        if steps < args.prob_imitation_steps:
            p = (1 - 1 / (1 + np.exp(
                (-steps / args.prob_imitation_steps + 0.5) * 10))) * args.base_prob_imitation
        else:
            p = 0
        return max(p, args.min_prob_imitation)

    def get_std_adapt():
        if args.std_adapt_steps <= 0:
            return None

        def std_adapt(steps):
            if steps < args.std_adapt_steps:
                return 1 - 1 / (1 + np.exp((-steps / args.std_adapt_steps + 0.5) * 10))
            else:
                return 0

        return std_adapt

    memory = SequentialMemory(limit=args.memory_limit, window_length=1)

    if args.action_noise == True:
        random_process = OrnsteinUhlenbeckProcess(size=nb_actions, theta=args.rd_theta,
                                                  mu=0, sigma=args.rd_sigma, dt=args.rd_dt)
    else:
        random_process = None

    if args.param_noise == True:
        param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.init_std,
                                             desired_action_stddev=args.action_std,
                                             adoption_coefficient=args.adapt,
                                             min_action_std=args.min_action_std,
                                             std_adapt=get_std_adapt())
    else:
        param_noise = None

    agent = DDPGAgent(nb_actions=nb_actions, build_func=build_func, nb_regions=nb_regions,
                      start_step=args.start_step, memory=memory,
                      nb_steps_warmup_critic=args.warmup_steps,
                      nb_steps_warmup_actor=args.warmup_steps,
                      exp_policy=get_exp_policy(OD, args), batch_size=args.batch_size,
                      param_noise=param_noise, get_prob_imitation=get_prob_imitation,
                      train_interval=args.train_interval, random_process=random_process,
                      gamma=args.decay, target_model_update=args.update,
                      delta_clip=args.delta_clip)
    agent.compile(eval(args.optimizer)(lr=args.lr, clipnorm=1.), metrics=['mae'])
    return agent
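# get_prob_imitation above is a sigmoid-shaped decay of the imitation
# probability. A standalone sketch of that schedule; the three constants stand
# in for args.prob_imitation_steps, args.base_prob_imitation and
# args.min_prob_imitation and are assumed values for illustration only.
PROB_IMITATION_STEPS = 10000
BASE_PROB_IMITATION = 1.0
MIN_PROB_IMITATION = 0.05

def prob_imitation_demo(steps):
    if steps < PROB_IMITATION_STEPS:
        p = (1 - 1 / (1 + np.exp((-steps / PROB_IMITATION_STEPS + 0.5) * 10))) * BASE_PROB_IMITATION
    else:
        p = 0
    return max(p, MIN_PROB_IMITATION)

# Decays from roughly the base probability toward the floor:
# prob_imitation_demo(0) ~= 0.99, prob_imitation_demo(5000) == 0.5,
# prob_imitation_demo(10000) == MIN_PROB_IMITATION.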
if args.flip_ratio:
    base_dir += 'flip_ratio_'
    base_dir += 'alternative_' + args.method + '_' + str(args.alpha) + '_' + str(args.ratio) + '/'
else:
    base_dir += 'alternative_non_robust_' + args.exploration_method + '_' + str(args.alpha) + '_' + str(args.ratio) + '/'

run_number = 0
while os.path.exists(base_dir + str(run_number)):
    run_number += 1
base_dir = base_dir + str(run_number)
os.makedirs(base_dir)

ounoise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(env.action_space.shape[0]),
                                       sigma=float(args.noise_scale) * np.ones(env.action_space.shape[0])
                                       ) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=args.noise_scale,
                                     desired_action_stddev=args.noise_scale) if args.param_noise else None


def reset_noise(a, a_noise, p_noise):
    if a_noise is not None:
        a_noise.reset()
    if p_noise is not None:
        a.perturb_actor_parameters(param_noise)


total_steps = 0
print(base_dir)

if args.num_steps is not None:
    assert args.num_epochs is None
    nb_epochs = int(args.num_steps) // (args.num_epochs_cycles * args.num_rollout_steps)
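# A minimal usage sketch of reset_noise, assuming the usual epoch/cycle/rollout
# structure implied by the nb_epochs computation above. The `agent` variable
# name and the loop layout are illustrative assumptions, not the original code.
for epoch in range(nb_epochs):
    for cycle in range(args.num_epochs_cycles):
        # Reset the OU process and re-perturb the actor at the start of each cycle.
        reset_noise(agent, ounoise, param_noise)
        # ... collect args.num_rollout_steps environment steps here ...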
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)
    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    tra_ac_veh = []
    tra_ac_att = []
    All_reward = []
    total_numsteps = 0
    updates = 0

    state_record = [env.reset()]
    # while len(state_record) < 20:
    #     s, _, _ = env.step(*env.random_action())
    #     state_record.append(s)
    # print(torch.Tensor([state_record[-20:]]).shape)

    for i_episode in range(args.num_episodes):
        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0

        # Warm up with random actions until 20 states have been recorded.
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.array([b]))
            local_steps += 1
            state_record.append(s)

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        local_steps = 0
        while True:
            if random.random() < ETA:
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                action_attacker = agent_attacker.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                     policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) /
                     policy_attacker.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                # print('sl', action_vehicle.shape)

            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy(), action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()

            next_state, reward, done = env.step(ac_v, ac_a)
            # print('tra_reward', reward)
            state_record.append(next_state)
            local_steps += 1
            total_numsteps += 1
            episode_steps += 1
            episode_reward += reward

            # print('sl-mem', state.shape, ac_v.shape)
            memory_SL_vehicle.append(state_record[-1], ac_v)
            memory_SL_attacker.append(state_record[-1], ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])

            memory_vehicle.push(prev_state, action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0]
            # print(state_record[-1].shape)

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, local_steps {}. total_steps {}, instant ave-reward is {:.4f}'.format(
                        i_episode, local_steps, total_numsteps, episode_reward))
                break

            if len(memory_vehicle) > args.batch_size:
                # Start training once the replay memory holds more than one batch.
                # print('begin training')
                for _ in range(args.updates_per_step):
                    transitions_vehicle = memory_vehicle.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = memory_attacker.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))
                    # print(batch_vehicle)

                    trans_veh = memory_SL_vehicle.sample(args.batch_size)
                    trans_att = memory_SL_attacker.sample(args.batch_size)

                    states_veh, actions_veh = [], []
                    states_att, actions_att = [], []
                    for sample in trans_veh:
                        state_veh, act_veh = sample
                        states_veh.append(state_veh)
                        actions_veh.append(act_veh)
                    for sample in trans_att:
                        state_att, act_att = sample
                        states_att.append(state_att)
                        actions_att.append(act_att)

                    states_veh = np.reshape(states_veh, (-1, env.observation_space))
                    states_att = np.reshape(states_att, (-1, env.observation_space))
                    actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                    actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                    policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                    policy_attacker.fit(states_att, actions_att, verbose=False)

                    value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

        if i_episode % 10 == 0 and i_episode > 0:
            state = env.reset()
            state_record = [np.array([state])]
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.array([b]))
                local_steps += 1
                state_record.append(s)

            evaluate_reward = 0
            while True:
                # la = np.random.randint(0, len(state_record) - 20, 1)[0]
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]

                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()
                next_state, reward, done = env.step(ac_v, ac_a)

                real_ac_v = ac_v[0].clip(-1, 1) + 1
                tra_ac_veh.append(real_ac_v / (sum(real_ac_v) + 0.0000001))
                tra_ac_att.append(ac_a[0])

                state_record.append(next_state)
                total_numsteps += 1
                local_steps += 1
                # print('eva_reward', reward)
                evaluate_reward += reward
                state = next_state[0]

                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, eva-reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100, total_numsteps, evaluate_reward, average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    # print(ac_v[0])
                    break

    # writer.add_scalar('reward/test', episode_reward, i_episode)
    env.close()

    df = pd.DataFrame()
    df['Eva'] = pd.Series(eva_reward)
    df['Tra'] = pd.Series(ave_reward)
    df2 = pd.DataFrame()
    df2['Weight'] = pd.Series(tra_ac_veh)
    df2['Attack'] = pd.Series(tra_ac_att)
    df.to_csv('./Result/reward_result_30.csv', index=None)
    df2.to_csv('./Result/action_result_30.csv', index=None)
    # np.savetxt('./Result/eva_result.csv', eva_reward, delimiter=',')
    # np.savetxt('./Result/ave_result.csv', ave_reward, delimiter=',')

    f = plt.figure()
    plt.plot(rewards[5:], label='Eva_reward')
    plt.show()

    AC_veh = np.array(tra_ac_veh)
    AC_att = np.array(tra_ac_att)
    # print(AC_veh.shape)
    # print(AC_veh)
    plt.plot(AC_veh[:, 0], label='Bacon1', alpha=0.2)
    plt.plot(AC_veh[:, 1], label='Bacon2', alpha=0.2)
    plt.plot(AC_veh[:, 2], label='Bacon3', alpha=0.2)
    plt.plot(AC_veh[:, 3], label='Bacon4', alpha=0.2)
    # plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.savefig('./Result/Veh_result_30.png', dpi=300)
    plt.show()

    plt.plot(AC_att[:, 0], label='Attack1', alpha=0.2)
    plt.plot(AC_att[:, 1], label='Attack2', alpha=0.2)
    plt.plot(AC_att[:, 2], label='Attack3', alpha=0.2)
    plt.plot(AC_att[:, 3], label='Attack4', alpha=0.2)
    # plt.plot(ave_reward, label='Tra_ave_reward')
    # plt.title('')
    plt.legend()
    plt.savefig('./Result/Att_result_30.png', dpi=300)
    plt.show()
def main():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    vehicle_memory = ReplayMemory(1000000)
    attacker_memory = ReplayMemory(1000000)

    vehicle_ounoise = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    attacker_ounoise = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    total_numsteps = 0
    updates = 0

    for i_episode in range(args.num_episodes):
        state = torch.Tensor([[env.reset()]])  # 4-dimensional velocity observation

        if args.ou_noise:
            vehicle_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            vehicle_ounoise.reset()
            attacker_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            attacker_ounoise.reset()

        episode_reward = 0
        t = 0  # steps taken in this episode; used below to slice out this episode's transitions
        while True:
            action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
            action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)

            next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
            total_numsteps += 1
            t += 1
            episode_reward += reward

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([-reward])
            reward_attacker = torch.Tensor([env.RC + reward])

            vehicle_memory.push(state, action_vehicle, mask, next_state, reward_vehicle)
            attacker_memory.push(state, action_attacker, mask, next_state, reward_attacker)

            state = next_state

            if len(vehicle_memory) > args.batch_size:
                for _ in range(args.updates_per_step):
                    transitions_vehicle = vehicle_memory.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transition_attacker = attacker_memory.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transition_attacker))

                    value_loss_1, policy_loss_1 = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_2, policy_loss_2 = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1
            if done:
                break
        # writer.add_scalar('reward/train', episode_reward, i_episode)

        # Update param_noise based on distance metric
        if args.param_noise:
            episode_transitions_vehicle = vehicle_memory.memory[vehicle_memory.position - t:vehicle_memory.position]
            states_vehicle = torch.cat([transition[0] for transition in episode_transitions_vehicle], 0)
            unperturbed_actions_vehicle = agent_vehicle.select_action(states_vehicle, None, None)
            perturbed_actions_vehicle = torch.cat([transition[1] for transition in episode_transitions_vehicle], 0)
            ddpg_dist_vehicle = ddpg_distance_metric(perturbed_actions_vehicle.numpy(),
                                                     unperturbed_actions_vehicle.numpy())
            param_noise_vehicle.adapt(ddpg_dist_vehicle)

            episode_transitions_attacker = attacker_memory.memory[attacker_memory.position - t:attacker_memory.position]
            states_attacker = torch.cat([transition[0] for transition in episode_transitions_attacker], 0)
            unperturbed_actions_attacker = agent_attacker.select_action(states_attacker, None, None)
            perturbed_actions_attacker = torch.cat([transition[1] for transition in episode_transitions_attacker], 0)
            ddpg_dist_attacker = ddpg_distance_metric(perturbed_actions_attacker.numpy(),
                                                      unperturbed_actions_attacker.numpy())
            param_noise_attacker.adapt(ddpg_dist_attacker)

        rewards.append(episode_reward)

        if i_episode % 10 == 0:
            state = torch.Tensor([[env.reset()]])
            episode_reward = 0
            while True:
                action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)
                next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
                episode_reward += reward
                next_state = torch.Tensor([[next_state]])
                state = next_state
                if done:
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    env.close()
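# The parameter-noise adaptation above calls ddpg_distance_metric, whose
# definition is not included in this excerpt. A sketch consistent with how it
# is used here (an assumed implementation, not the original): the root-mean-
# square distance between the action batches produced by the perturbed and
# unperturbed actors on the same states.
def ddpg_distance_metric(actions1, actions2):
    # actions1, actions2: numpy arrays of shape (batch, action_dim)
    diff = actions1 - actions2
    mean_diff = np.mean(np.square(diff), axis=0)
    return np.sqrt(np.mean(mean_diff))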
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(args.NashMode, RC, args.AttackMode, args.RewardMode)

    # reward_file = open('reward' + suffix + '.txt', 'w')
    # attack_file = open('attacker_action' + suffix + '.txt', 'w')
    # weight_file = open('vehicle_weight' + suffix + '.txt', 'w')
    # distance_file = open('Distance' + suffix + '.txt', 'w')
    # reward_file.write("""
    # Environment Initializing...
    # The initial head car velocity is {}
    # The initial safe distance is {}
    # The Nash Eq* Factor RC is {}
    # The Reward Calculation Mode is {}
    # The Attack Mode is {}
    # The Nash Mode is {}
    # """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash))
    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')

    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')
    except:
        print('No existed vehicle RL model')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')
    except:
        print('No existed attacker RL model')

    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except:
        policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except:
        policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker')
    print('*' * 20, '\n\n\n')

    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)
    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])

    rewards = []
    total_numsteps = 0

    for i_episode in range(args.num_episodes):
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            reward_data.to_csv(suffix + '.csv', index=False)
            res_data.to_csv(suffix + '.csv', index=False)

        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode / args.num_episodes * 0.1

        print('No.{} episode starts... ETA is {}'.format(i_episode, ETA))

        # reward_file = open('reward' + suffix + '.txt', 'a')
        # attack_file = open('attacker_action' + suffix + '.txt', 'a')
        # weight_file = open('vehicle_weight' + suffix + '.txt', 'a')
        # distance_file = open('Distance' + suffix + '.txt', 'a')

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0

        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        local_steps = 0
        while True:
            sigma = random.random()
            if sigma > ETA:
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                action_attacker = agent_attacker.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                     policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) /
                     policy_attacker.predict(state_record[-1].reshape(-1, 4)).sum()])[0]

            # Constrain the vehicle weights to sum to 1.
            action_vehicle = action_vehicle.numpy()[0] / (action_vehicle.numpy()[0].sum())
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack': env.action_attacker,
                                         'Weight': action_vehicle,
                                         'Eva_distance': env.d}])
            # Copy the post-processed attack action back over the raw one.
            action_attacker = env.action_attacker

            total_numsteps += 1
            episode_reward += reward
            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1, 4))
            action_attacker = torch.Tensor(action_attacker.reshape(1, 4))
            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])

            memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask, next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(i_episode, episode_reward))
                break

            if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:
                # Start training once both memories hold more than one batch.
                for _ in range(args.updates_per_step):
                    transitions_vehicle = memory_vehicle.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = memory_attacker.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))

                    trans_veh = memory_SL_vehicle.sample(args.batch_size)
                    trans_att = memory_SL_attacker.sample(args.batch_size)

                    states_veh, actions_veh = [], []
                    states_att, actions_att = [], []
                    for sample in trans_veh:
                        state_veh, act_veh = sample
                        states_veh.append(state_veh)
                        actions_veh.append(act_veh)
                    for sample in trans_att:
                        state_att, act_att = sample
                        states_att.append(state_att)
                        actions_att.append(act_att)

                    states_veh = np.reshape(states_veh, (-1, env.observation_space))
                    states_att = np.reshape(states_att, (-1, env.observation_space))
                    actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                    actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                    policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                    policy_attacker.fit(states_att, actions_att, verbose=False)

                    agent_vehicle.update_parameters(batch_vehicle)
                    agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)

        if i_episode % 10 == 0 and i_episode != 0:
            eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance'])
            # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode))
            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0

            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)

            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                         policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]

                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]

                next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2)
                eva_res_data = eva_res_data.append([{'Eva_reward': evaluate_reward, 'Eva_distance': env.d}])
                evaluate_reward += reward

                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(
                    #     i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    break

    # writer.add_scalar('reward/test', episode_reward, i_episode)
    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()
    env.close()

    reward_data.to_csv(suffix + '_reward.csv', index=False)
    res_data.to_csv(suffix + '.csv', index=False)
    eva_res_data.to_csv(suffix + '_eva.csv', index=False)

    # Save models.
    agent_vehicle.save_model('vehicle_' + suffix)
    agent_attacker.save_model('attacker_' + suffix)
    policy_attacker.save('models/attacker_' + suffix + '.h5')
    policy_vehicle.save('models/vehicle_' + suffix + '.h5')
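# Sketch of the ETA schedule used in fit_nash above, isolated for clarity. As
# used there, sigma > ETA selects the RL best-response agents (and feeds their
# actions to the supervised memories), while sigma <= ETA selects the averaged
# supervised policies, a mixing scheme reminiscent of the anticipatory
# parameter in fictitious self-play; that interpretation is inferred from the
# code, not stated in it.
def eta_schedule(nash_mode, i_episode, num_episodes):
    if nash_mode == 0:
        return 0.0                                   # always act with the RL best response
    elif nash_mode == 1:
        return 0.5                                   # mix both policies evenly
    elif nash_mode == 2:
        return 0.1 - i_episode / num_episodes * 0.1  # anneal from 0.1 toward 0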