def main():
    my_env = env()
    agent = NAF_CNN(0.99, 0.001, 128,
                    my_env.observation_space.shape[0], my_env.action_space)

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--noise_scale', type=float, default=0.3, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                        help='number of episodes with noise (default: 100)')
    args = parser.parse_args()

    # -- exploration noise (Ornstein-Uhlenbeck process) --
    ounoise = OUNoise(my_env.action_space.shape[0])
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
        0, args.exploration_end - 1) / args.exploration_end + args.final_noise_scale
    ounoise.reset()

    state = my_env.reset()
    # roll out at most 10 noisy steps as a smoke test
    for _ in range(10):
        action = agent.select_action(state, ounoise)
        print("action: {}".format(action))
        next_state, reward, done = my_env.step(action)
        if done:
            break
        print(reward)
        state = next_state
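# All the scripts in this section rely on an OUNoise helper. This is a minimal
# sketch of the standard Ornstein-Uhlenbeck action-noise process used for
# DDPG-style exploration; the attribute and method names (scale, reset, noise)
# are inferred from the call sites above and below, and the noise() method is
# an assumption about how select_action consumes it -- this is not necessarily
# the authors' exact implementation.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu            # long-run mean of the process
        self.theta = theta      # pull strength toward the mean
        self.sigma = sigma      # diffusion (random-walk) strength
        self.scale = 1.0        # annealed externally by the training loops
        self.reset()

    def reset(self):
        # restart the process at its mean at the start of each episode
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the result is scaled before use
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state * self.scale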
'''
Here, num_episodes corresponds to the generations in Algo 1. In every
generation the population is evaluated, ranked, mutated, and re-inserted
into the population.
'''
evo.evaluate_pop()
evo.rank_pop_selection_mutation()
print("Evolutionary Fitness = " + str(evo.best_policy.fitness))

'''
############# The DDPG part #############
'''
state = torch.Tensor([env.reset()])  # Algo 1, line 6
# anneal the OU noise scale linearly over the exploration window
ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
    0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
ounoise.reset()

episode_reward = 0
for t in range(args.num_steps):  # line 7
    # forward pass through the actor network
    action = agent.select_action(state, ounoise)  # line 8
    next_state, reward, done, _ = env.step(action.numpy()[0])  # line 9
    episode_reward += reward

    action = torch.Tensor(action)
    mask = torch.Tensor([not done])
    next_state = torch.Tensor([next_state])
    reward = torch.Tensor([reward])
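# Every loop in this section pushes (state, action, mask, next_state, reward)
# tuples into a ReplayMemory and later rebuilds batches via
# Transition(*zip(*transitions)). This is a plausible minimal sketch of that
# buffer, following the familiar namedtuple replay pattern and inferred from
# the push/sample/append call sites -- not the authors' exact implementation.
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'mask', 'next_state', 'reward'))

class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        # ring buffer: overwrite the oldest transition once capacity is reached
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def append(self, state, action):
        # the SL buffers in the Nash scripts store raw (state, action) pairs
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = (state, action)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)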
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name', default="OurEnv-v0", help='name of the environment')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001,
                        help='soft target-update factor (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=33, metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed', type=int, default=4, metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size', type=int, default=512, metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps', type=int, default=300, metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes', type=int, default=50, metavar='N',
                        help='number of episodes (default: 50)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent', type=bool, default=True,
                        help='save model to file')
    parser.add_argument('--load_agent', type=bool, default=False,
                        help='load model from file')
    parser.add_argument('--train_model', type=bool, default=True,
                        help='Training or run')
    parser.add_argument('--load_exp', type=bool, default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot', type=bool, default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps', type=int, default=5, metavar='N',
                        help='number of greedy episodes per evaluation (default: 5)')
    args = parser.parse_args()

    #env = gym.make(args.env_name)
    env = Env()
    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N --
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded".format(args.env_name, args.batch_size, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1', 'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    # -- ROS setup: publish actions, subscribe to the state topic --
    sim_reset_start()
    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(state, ounoise) if args.train_model \
                else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            memory.push(state, action, mask, next_state, reward)

            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i + 1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            # -- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size / 2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size / 2)
            #         transitions_g = memory_g.sample(args.batch_size / 2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)
            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)
            else:
                time.sleep(0.1)
            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:
            sate_Q_plot(agent, i_episode)

        # -- save model and experience --
        if args.save_agent:
            agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
            with open('exp_replay.pk1', 'wb') as output:
                pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
            #with open('exp_replay_g.pk1', 'wb') as output:
            #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes // 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- evaluate greedily (without noise) every greedy_episode episodes --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for g in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")
                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0
                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[-1].append(sta)

                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward
                    state_visited.append(state)
                    action_taken.append(action)
                    state = next_state
                    steps += 1
                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break
                    rate.sleep()

                # penalize episodes that ended by hitting an obstacle
                if obs_hit:
                    steps = 300
                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append(np.max(greedy_reward[-greedy_range:]))
            lower_reward.append(np.min(greedy_reward[-greedy_range:]))
            avg_greedy_reward.append(np.mean(greedy_reward[-greedy_range:]))
            avg_steps_to_goal.append(np.mean(steps_to_goal[-greedy_range:]))

            print("Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}"
                  .format(i_episode, total_numsteps, avg_greedy_reward[-1],
                          np.mean(rewards[-greedy_episode:])))

    # -- save final model and experience --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
        #with open('exp_replay_g.pk1', 'wb') as output:
        #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format((time.time() - t_start) / 60))
    print('Time per episode: {} s'.format((time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy, lower_reward, upper_reward, facecolor='red', alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size, '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size, '.png')
    plt.savefig(fname2)
    plt.close()
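# The same linear annealing of the OU noise scale recurs in every training loop
# in this section. Factoring it into a helper makes the intended formula
# explicit; the function name annealed_noise_scale is mine, not from the
# original scripts.
def annealed_noise_scale(i_episode, noise_scale, final_noise_scale, exploration_end):
    """Linearly decay from noise_scale to final_noise_scale over exploration_end episodes."""
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale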
memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05, desired_action_stddev=args.noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
        episode_reward += reward

        action = torch.Tensor(action)
        mask = torch.Tensor([not done])
        next_state = torch.Tensor([next_state])
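# AdaptiveParamNoiseSpec follows the adaptive parameter-noise recipe of
# Plappert et al.: the standard deviation of the weight perturbation is grown
# or shrunk so that the induced action-space noise tracks a desired level.
# This is a hedged sketch consistent with the constructor arguments and the
# adapt() calls in these scripts, not a verified copy of the library class.
class AdaptiveParamNoiseSpec:
    def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1,
                 adaptation_coefficient=1.01):
        self.initial_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient
        self.current_stddev = initial_stddev

    def adapt(self, distance):
        # shrink the perturbation when its action-space effect is too large,
        # grow it when the effect is too small
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient
        else:
            self.current_stddev *= self.adaptation_coefficient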
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)
    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    eva_ac_veh = []
    eva_ac_att = []
    total_numsteps = 0
    updates = 0

    # while len(state_record) < 20:
    #     s, _, _ = env.step(env.random_action())
    #     state_record.append(s)

    for i_episode in range(args.num_episodes):
        state = env.reset()

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        while True:
            # with probability ETA act from the RL (best-response) policies,
            # otherwise from the supervised average policies
            if random.random() < ETA:
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor([[state]]), ounoise_vehicle, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(
                    torch.Tensor([[state]]), ounoise_attacker, param_noise_attacker)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state.reshape(-1, 4)) /
                     policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state.reshape(-1, 4)) /
                     policy_attacker.predict(state.reshape(-1, 4)).sum()])

            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]

            next_state, reward, done = env.step(ac_v, ac_a)
            total_numsteps += 1
            episode_reward += reward

            memory_SL_vehicle.append(state, ac_v)
            memory_SL_attacker.append(state, ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])  # zero-sum up to the constant RC

            memory_vehicle.push(torch.Tensor([[state]]), action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(torch.Tensor([[state]]), action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0][0]

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                break

            if len(memory_vehicle) > args.batch_size:
                # start training
                for _ in range(args.updates_per_step):
                    transitions_vehicle = memory_vehicle.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = memory_attacker.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))

                    trans_veh = memory_SL_vehicle.sample(args.batch_size)
                    trans_att = memory_SL_attacker.sample(args.batch_size)

                    states_veh, actions_veh = [], []
                    states_att, actions_att = [], []
                    for state_veh, act_veh in trans_veh:
                        states_veh.append(state_veh)
                        actions_veh.append(act_veh)
                    for state_att, act_att in trans_att:
                        states_att.append(state_att)
                        actions_att.append(act_att)

                    states_veh = np.reshape(states_veh, (-1, env.observation_space))
                    states_att = np.reshape(states_att, (-1, env.observation_space))
                    actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                    actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                    # supervised (average-policy) updates
                    policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                    policy_attacker.fit(states_att, actions_att, verbose=False)

                    # RL (best-response) updates
                    value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

        # -- evaluation every 10 episodes --
        if i_episode % 10 == 0:
            state = env.reset()
            evaluate_reward = 0
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor([[state]]), ounoise_vehicle, param_noise_vehicle)
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor([[state]]), ounoise_attacker, param_noise_attacker)
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state.reshape(-1, 4)) / policy_vehicle.predict(state.reshape(-1, 4)).sum()])
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state.reshape(-1, 4)) / policy_attacker.predict(state.reshape(-1, 4)).sum()])

                if is_cuda:
                    ac_v, ac_a = action_vehicle.cpu().numpy()[0], action_attacker.cpu().numpy()[0]
                else:
                    ac_v, ac_a = action_vehicle.numpy()[0], action_attacker.numpy()[0]

                next_state, reward, done = env.step(ac_v, ac_a)
                total_numsteps += 1
                evaluate_reward += reward
                state = next_state[0]

                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100, total_numsteps, evaluate_reward, average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    print(ac_v[0])
                    eva_ac_veh.append((ac_v[0] + 1) / sum(ac_v[0] + 1))
                    eva_ac_att.append((ac_a[0] + 1) / sum(ac_a[0] + 1))
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)

    env.close()

    f = plt.figure()
    plt.plot(eva_reward, label='Eva_reward')
    plt.plot(ave_reward, label='Tra_ave_reward')
    plt.legend()
    plt.show()

    AC_veh = np.array(eva_ac_veh)
    AC_att = np.array(eva_ac_att)
    plt.plot(AC_veh[:, 0], label='Bacon1')
    plt.plot(AC_veh[:, 1], label='Bacon2')
    plt.plot(AC_veh[:, 2], label='Bacon3')
    plt.plot(AC_veh[:, 3], label='Bacon4')
    plt.legend()
    plt.savefig('Veh_result.png', dpi=300)
    plt.show()
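# The evaluation code above projects a raw action vector from [-1, 1]^n onto
# the probability simplex via (a + 1) / sum(a + 1). A small named helper makes
# that intent explicit; the name to_simplex and the epsilon guard are mine,
# not from the original scripts.
import numpy as np

def to_simplex(action, eps=1e-7):
    shifted = np.asarray(action) + 1.0      # shift [-1, 1] to [0, 2]: entries become non-negative
    return shifted / (shifted.sum() + eps)  # normalize so the components sum to 1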
def main():
    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name', default="Pendulum-v0", help='name of the environment')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001,
                        help='soft target-update factor (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G',
                        help='initial noise scale (default: 0.4)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=33, metavar='N',
                        help='number of episodes with noise (default: 33)')
    parser.add_argument('--seed', type=int, default=4, metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size', type=int, default=200, metavar='N',
                        help='batch size (default: 200)')
    parser.add_argument('--num_steps', type=int, default=100, metavar='N',
                        help='max episode length (default: 100)')
    parser.add_argument('--num_episodes', type=int, default=5000, metavar='N',
                        help='number of episodes (default: 5000)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent', type=bool, default=True,
                        help='save model to file')
    parser.add_argument('--train_model', type=bool, default=True,
                        help='Training or run')
    parser.add_argument('--load_agent', type=bool, default=False,
                        help='load model from file')
    parser.add_argument('--load_exp', type=bool, default=False,
                        help='load saved experience')
    parser.add_argument('--greedy_steps', type=int, default=10, metavar='N',
                        help='number of greedy episodes (default: 10)')
    args = parser.parse_args()

    env = ManipulateEnv()
    #env = gym.make(args.env_name)

    writer = SummaryWriter('runs/')

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N --
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, args.num_episodes, '.pth')
        print("agent: naf_{}_{}_{}_{}, is loaded".format(
            args.env_name, args.batch_size, args.num_episodes, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open('/home/quantao/Workspaces/catkin_ws/src/panda_demos/naf_env/src/exp_replay.pk1', 'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    rewards = []
    total_numsteps = 0
    updates = 0

    #env.init_ros()
    #env.reset()

    t_start = time.time()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        state = torch.Tensor([env.reset()])

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(state, ounoise) if args.train_model \
                else agent.select_action(state)

            next_state, reward, done, info = env.step(action)
            #env.render()
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            next_state = torch.Tensor([next_state])

            memory.push(state, action, mask, next_state, reward)
            state = next_state

            if done or total_numsteps % args.num_steps == 0:
                break

        # -- training --
        if len(memory) >= args.batch_size and args.train_model:
            env.reset()
            print("Training model")
            for _ in range(args.updates_per_step * args.num_steps):
                transitions = memory.sample(args.batch_size)
                batch = Transition(*zip(*transitions))
                value_loss, policy_loss = agent.update_parameters(batch)

                writer.add_scalar('loss/value', value_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)
                updates += 1

        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Train Episode: {}, total numsteps: {}, reward: {}".format(
            i_episode, total_numsteps, episode_reward))
        rewards.append(episode_reward)

        # -- evaluate a greedy episode (no noise) every 10 episodes --
        greedy_numsteps = 0
        if i_episode % 10 == 0:
            state = torch.Tensor([env.reset()])
            episode_reward = 0
            while True:
                action = agent.select_action(state)
                next_state, reward, done, info = env.step(action)
                episode_reward += reward
                greedy_numsteps += 1
                state = torch.Tensor([next_state])
                #env.render()
                if done or greedy_numsteps % args.num_steps == 0:
                    break

            writer.add_scalar('reward/test', episode_reward, i_episode)
            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    # -- save model and experience --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, args.num_episodes, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format((time.time() - t_start) / 60))
    print('Time per episode: {} s'.format((time.time() - t_start) / args.num_episodes))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))
def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space)
    policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space)

    memory_vehicle = ReplayMemory(1000000)
    memory_attacker = ReplayMemory(1000000)
    memory_SL_vehicle = ReplayMemory(100000)
    memory_SL_attacker = ReplayMemory(100000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    eva_reward = []
    ave_reward = []
    tra_ac_veh = []
    tra_ac_att = []
    All_reward = []
    total_numsteps = 0
    updates = 0

    state_record = [env.reset()]

    for i_episode in range(args.num_episodes):
        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0

        # warm up the 20-step history window with random actions
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.array([b]))
            local_steps += 1
            state_record.append(s)

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        local_steps = 0
        while True:
            if random.random() < ETA:
                # RL (best-response) policies act on the last 20 states
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                action_attacker = agent_attacker.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
            else:
                # supervised average policies act on the latest state only
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                     policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) /
                     policy_attacker.predict(state_record[-1].reshape(-1, 4)).sum()])[0]

            if is_cuda:
                ac_v, ac_a = action_vehicle.cpu().numpy(), action_attacker.cpu().numpy()[0]
            else:
                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()

            next_state, reward, done = env.step(ac_v, ac_a)
            state_record.append(next_state)
            local_steps += 1
            total_numsteps += 1
            episode_steps += 1
            episode_reward += reward

            memory_SL_vehicle.append(state_record[-1], ac_v)
            memory_SL_attacker.append(state_record[-1], ac_a)

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([env.RC - reward])

            memory_vehicle.push(prev_state, action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, action_attacker, mask, next_state, reward_attacker)

            state = next_state.numpy()[0]

            if done:
                rewards.append(episode_reward)
                if i_episode % 100 == 0:
                    print('Episode {} ends, local_steps {}, total_steps {}, instant ave-reward is {:.4f}'.format(
                        i_episode, local_steps, total_numsteps, episode_reward))
                break

            if len(memory_vehicle) > args.batch_size:
                # start training
                for _ in range(args.updates_per_step):
                    transitions_vehicle = memory_vehicle.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = memory_attacker.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))

                    trans_veh = memory_SL_vehicle.sample(args.batch_size)
                    trans_att = memory_SL_attacker.sample(args.batch_size)

                    states_veh, actions_veh = [], []
                    states_att, actions_att = [], []
                    for state_veh, act_veh in trans_veh:
                        states_veh.append(state_veh)
                        actions_veh.append(act_veh)
                    for state_att, act_att in trans_att:
                        states_att.append(state_att)
                        actions_att.append(act_att)

                    states_veh = np.reshape(states_veh, (-1, env.observation_space))
                    states_att = np.reshape(states_att, (-1, env.observation_space))
                    actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                    actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                    policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                    policy_attacker.fit(states_att, actions_att, verbose=False)

                    value_loss_vehicle, policy_loss_vehicle = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_attacker, policy_loss_attacker = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

        # -- evaluation every 10 episodes --
        if i_episode % 10 == 0 and i_episode > 0:
            state = env.reset()
            state_record = [np.array([state])]
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.array([b]))
                local_steps += 1
                state_record.append(s)

            evaluate_reward = 0
            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor([policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)) / policy_vehicle.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor([policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)) / policy_attacker.predict(
                        state_record[-1].reshape(-1, 4)).sum()])[0]

                ac_v, ac_a = action_vehicle.numpy(), action_attacker.numpy()
                next_state, reward, done = env.step(ac_v, ac_a)

                # record the normalized vehicle weights and the attack actions
                real_ac_v = ac_v[0].clip(-1, 1) + 1
                tra_ac_veh.append(real_ac_v / (sum(real_ac_v) + 1e-7))
                tra_ac_att.append(ac_a[0])

                state_record.append(next_state)
                total_numsteps += 1
                local_steps += 1
                evaluate_reward += reward
                state = next_state[0]

                if done:
                    average_reward = np.mean(rewards[-10:])
                    print("{} % Episode finished, total numsteps: {}, eva-reward: {}, average reward: {}".format(
                        i_episode / args.num_episodes * 100, total_numsteps, evaluate_reward, average_reward))
                    eva_reward.append(evaluate_reward)
                    ave_reward.append(average_reward)
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)

    env.close()

    # -- dump results and plot --
    df = pd.DataFrame()
    df['Eva'] = pd.Series(eva_reward)
    df['Tra'] = pd.Series(ave_reward)
    df2 = pd.DataFrame()
    df2['Weight'] = pd.Series(tra_ac_veh)
    df2['Attack'] = pd.Series(tra_ac_att)
    df.to_csv('./Result/reward_result_30.csv', index=None)
    df2.to_csv('./Result/action_result_30.csv', index=None)

    f = plt.figure()
    plt.plot(rewards[5:], label='Eva_reward')
    plt.show()

    AC_veh = np.array(tra_ac_veh)
    AC_att = np.array(tra_ac_att)
    plt.plot(AC_veh[:, 0], label='Bacon1', alpha=0.2)
    plt.plot(AC_veh[:, 1], label='Bacon2', alpha=0.2)
    plt.plot(AC_veh[:, 2], label='Bacon3', alpha=0.2)
    plt.plot(AC_veh[:, 3], label='Bacon4', alpha=0.2)
    plt.legend()
    plt.savefig('./Result/Veh_result_30.png', dpi=300)
    plt.show()

    plt.plot(AC_att[:, 0], label='Attack1', alpha=0.2)
    plt.plot(AC_att[:, 1], label='Attack2', alpha=0.2)
    plt.plot(AC_att[:, 2], label='Attack3', alpha=0.2)
    plt.plot(AC_att[:, 3], label='Attack4', alpha=0.2)
    plt.legend()
    plt.savefig('./Result/Att_result_30.png', dpi=300)
    plt.show()
def main():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space)

    vehicle_memory = ReplayMemory(1000000)
    attacker_memory = ReplayMemory(1000000)

    vehicle_ounoise = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    attacker_ounoise = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    rewards = []
    total_numsteps = 0
    updates = 0

    for i_episode in range(args.num_episodes):
        state = torch.Tensor([[env.reset()]])  # 4-dimensional velocity observation

        if args.ou_noise:
            vehicle_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            vehicle_ounoise.reset()
            attacker_ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            attacker_ounoise.reset()

        episode_reward = 0
        t = 0  # episode step counter, used below to slice this episode's transitions
        while True:
            action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
            action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)

            next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
            total_numsteps += 1
            t += 1
            episode_reward += reward

            action_vehicle = torch.Tensor(action_vehicle)
            action_attacker = torch.Tensor(action_attacker)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
            # the attacker gains what the vehicle loses, offset by the constant RC
            reward_vehicle = torch.Tensor([-reward])
            reward_attacker = torch.Tensor([env.RC + reward])

            vehicle_memory.push(state, action_vehicle, mask, next_state, reward_vehicle)
            attacker_memory.push(state, action_attacker, mask, next_state, reward_attacker)

            state = next_state

            if len(vehicle_memory) > args.batch_size:
                for _ in range(args.updates_per_step):
                    transitions_vehicle = vehicle_memory.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transition_attacker = attacker_memory.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transition_attacker))

                    value_loss_1, policy_loss_1 = agent_vehicle.update_parameters(batch_vehicle)
                    value_loss_2, policy_loss_2 = agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)
                    updates += 1

            if done:
                break
        # writer.add_scalar('reward/train', episode_reward, i_episode)

        # Update param_noise based on distance metric
        if args.param_noise:
            episode_transitions_vehicle = vehicle_memory.memory[
                vehicle_memory.position - t:vehicle_memory.position]
            states_vehicle = torch.cat([transition[0] for transition in episode_transitions_vehicle], 0)
            unperturbed_actions_vehicle = agent_vehicle.select_action(states_vehicle, None, None)
            perturbed_actions_vehicle = torch.cat([transition[1] for transition in episode_transitions_vehicle], 0)
            ddpg_dist_vehicle = ddpg_distance_metric(perturbed_actions_vehicle.numpy(),
                                                     unperturbed_actions_vehicle.numpy())
            param_noise_vehicle.adapt(ddpg_dist_vehicle)

            episode_transitions_attacker = attacker_memory.memory[
                attacker_memory.position - t:attacker_memory.position]
            states_attacker = torch.cat([transition[0] for transition in episode_transitions_attacker], 0)
            unperturbed_actions_attacker = agent_attacker.select_action(states_attacker, None, None)
            perturbed_actions_attacker = torch.cat([transition[1] for transition in episode_transitions_attacker], 0)
            ddpg_dist_attacker = ddpg_distance_metric(perturbed_actions_attacker.numpy(),
                                                      unperturbed_actions_attacker.numpy())
            param_noise_attacker.adapt(ddpg_dist_attacker)

        rewards.append(episode_reward)

        # -- evaluation episode every 10 episodes --
        if i_episode % 10 == 0:
            state = torch.Tensor([[env.reset()]])
            episode_reward = 0
            while True:
                action_vehicle = agent_vehicle.select_action(state, vehicle_ounoise, param_noise_vehicle)
                action_attacker = agent_attacker.select_action(state, attacker_ounoise, param_noise_attacker)
                next_state, reward, done = env.step(action_vehicle.numpy()[0], action_attacker.numpy()[0])
                episode_reward += reward
                state = torch.Tensor([[next_state]])
                if done:
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)
            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    env.close()
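# ddpg_distance_metric is part of the adaptive parameter-noise recipe
# (Plappert et al.): the perturbation stddev is adapted by comparing actions
# of the perturbed and unperturbed policies on the same states. This sketch is
# consistent with how the function is called above -- the RMS distance between
# the two action batches -- but it is a reconstruction, not a verified copy.
import numpy as np

def ddpg_distance_metric(actions1, actions2):
    # root-mean-square distance between two batches of actions (numpy arrays)
    diff = actions1 - actions2
    return np.sqrt(np.mean(np.square(diff)))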
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(
        args.NashMode, RC, args.AttackMode, args.RewardMode)

    # Earlier revisions logged to plain text files, e.g.:
    # reward_file = open('reward' + suffix + '.txt', 'w')
    # reward_file.write("""
    # Environment Initializing...
    # The initial head car velocity is {}
    # The initial safe distance is {}
    # The Nash Eq* Factor RC is {}
    # The Reward Calculation Mode is {}
    # The Attack Mode is {}
    # The Nash Mode is {}
    # """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.NashMode))
    # reward_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')

    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')
    except:
        print('No existing vehicle RL model')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')
    except:
        print('No existing attacker RL model')
    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except:
        policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except:
        policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker')
    print('*' * 20, '\n\n\n')

    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)
    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None
    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05, desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])
    # initialized up front so the final dump is defined even without evaluations
    eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance'])

    rewards = []
    total_numsteps = 0

    for i_episode in range(args.num_episodes):
        # -- periodic checkpoint of the result tables --
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            reward_data.to_csv(suffix + '_reward.csv', index=False)
            res_data.to_csv(suffix + '.csv', index=False)

        # ETA: probability of acting from the SL (average) policy
        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode / args.num_episodes * 0.1

        print('No.{} episode starts... ETA is {}'.format(i_episode, ETA))

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0

        # warm up the 20-step history window with random vehicle actions
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        local_steps = 0
        while True:
            sigma = random.random()
            if sigma > ETA:
                # RL (best-response) policies act on the last 20 states
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                action_attacker = agent_attacker.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
            else:
                # supervised average policies act on the latest state only
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                     policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) /
                     policy_attacker.predict(state_record[-1].reshape(-1, 4)).sum()])[0]

            # constrain the vehicle weights to sum to 1
            action_vehicle = action_vehicle.numpy()[0] / (action_vehicle.numpy()[0].sum())
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack': env.action_attacker,
                                         'Weight': action_vehicle,
                                         'Eva_distance': env.d}])
            # copy the processed attack values back over the raw ones
            action_attacker = env.action_attacker

            total_numsteps += 1
            episode_reward += reward
            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

            # only best-response (RL) actions are stored in the SL buffers
            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1, 4))
            action_attacker = torch.Tensor(action_attacker.reshape(1, 4))
            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])

            memory_vehicle.push(prev_state, action_vehicle, mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, action_attacker, mask, next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                break

            if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:
                # start training
                for _ in range(args.updates_per_step):
                    transitions_vehicle = memory_vehicle.sample(args.batch_size)
                    batch_vehicle = Transition(*zip(*transitions_vehicle))
                    transitions_attacker = memory_attacker.sample(args.batch_size)
                    batch_attacker = Transition(*zip(*transitions_attacker))

                    trans_veh = memory_SL_vehicle.sample(args.batch_size)
                    trans_att = memory_SL_attacker.sample(args.batch_size)

                    states_veh, actions_veh = [], []
                    states_att, actions_att = [], []
                    for state_veh, act_veh in trans_veh:
                        states_veh.append(state_veh)
                        actions_veh.append(act_veh)
                    for state_att, act_att in trans_att:
                        states_att.append(state_att)
                        actions_att.append(act_att)

                    states_veh = np.reshape(states_veh, (-1, env.observation_space))
                    states_att = np.reshape(states_att, (-1, env.observation_space))
                    actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                    actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                    policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                    policy_attacker.fit(states_att, actions_att, verbose=False)

                    agent_vehicle.update_parameters(batch_vehicle)
                    agent_attacker.update_parameters(batch_attacker)
                    # writer.add_scalar('loss/value', value_loss, updates)
                    # writer.add_scalar('loss/policy', policy_loss, updates)

        # -- evaluation every 10 episodes --
        if i_episode % 10 == 0 and i_episode != 0:
            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)

            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                         policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]

                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]

                next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2)
                eva_res_data = eva_res_data.append([{'Eva_reward': evaluate_reward,
                                                     'Eva_distance': env.d}])
                evaluate_reward += reward
                state_record.append(next_state)

                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    break
            # writer.add_scalar('reward/test', episode_reward, i_episode)

    env.close()

    reward_data.to_csv(suffix + '_reward.csv', index=False)
    res_data.to_csv(suffix + '.csv', index=False)
    eva_res_data.to_csv(suffix + '_eva.csv', index=False)

    # -- save models --
    agent_vehicle.save_model('vehicle_' + suffix)
    agent_attacker.save_model('attacker_' + suffix)
    policy_attacker.save('models/attacker_' + suffix + '.h5')
    policy_vehicle.save('models/vehicle_' + suffix + '.h5')
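# create_SL_model is the average-policy (supervised) network of this NFSP-style
# setup. The following is a plausible minimal Keras definition inferred from
# the predict/fit/save call sites; the layer sizes, optimizer, and loss are
# assumptions of this sketch, not the authors' verified architecture.
from keras.models import Sequential
from keras.layers import Dense

def create_SL_model(observation_space, action_space, name='policy'):
    model = Sequential(name=name)
    model.add(Dense(128, activation='relu', input_shape=(observation_space,)))
    model.add(Dense(128, activation='relu'))
    # softmax head: the output is treated as a distribution over action components
    model.add(Dense(action_space, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    return model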
def main():
    cfg = ConfigParser()
    cfg.read('config.ini')
    IP = cfg.get('server', 'ip')
    PORT = cfg.getint('server', 'port')
    FILE = cfg.get('file', 'file')
    SIZE = cfg.getint('env', 'buffer_size')
    TIME = cfg.getfloat('env', 'time')
    EPISODE = cfg.getint('env', 'episode')

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001, metavar='G',
                        help='soft target-update factor (default: 0.001)')
    parser.add_argument('--noise_scale', type=float, default=0.3, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.3)')
    parser.add_argument('--exploration_end', type=int, default=100, metavar='N',
                        help='number of episodes with noise (default: 100)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='number of hidden units (default: 128)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                        help='model updates per simulator step (default: 5)')
    parser.add_argument('--batch_size', type=int, default=64, metavar='N',
                        help='batch size (default: 64)')

    # connect to the server and hand the socket's file descriptor to the environment
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect((IP, PORT))
    fd = sock.fileno()
    my_env = env(fd=fd, buff_size=SIZE, time=TIME, k=8, l=0.01, n=0.03, p=0.05)
    mpsched.persist_state(fd)

    args = parser.parse_args()

    agent = NAF_CNN(args.gamma, args.tau, args.hidden_size,
                    my_env.observation_space.shape[0], my_env.action_space)
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(my_env.action_space.shape[0])

    rewards = []
    times = []
    for i_episode in range(EPISODE):
        if i_episode < 0.9 * EPISODE:  # training
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()
            state = my_env.reset()
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise.reset()
            print(state)

            episode_reward = 0
            while True:
                state = torch.FloatTensor(state)
                action = agent.select_action(state, ounoise)
                next_state, reward, count, recv_buff_size, done = my_env.step(action)
                episode_reward += reward

                action = torch.FloatTensor(action)
                mask = torch.Tensor([not done])
                next_state = torch.FloatTensor(next_state)
                reward = torch.FloatTensor([float(reward)])

                memory.push(state, action, mask, next_state, reward)
                state = next_state

                if len(memory) > args.batch_size * 5:
                    for _ in range(args.updates_per_step):
                        transitions = memory.sample(args.batch_size)
                        batch = Transition(*zip(*transitions))
                        agent.update_parameters(batch)

                if done:
                    break

            rewards.append(episode_reward)
            io.join()
        else:  # testing
            io = io_thread(sock=sock, filename=FILE, buffer_size=SIZE)
            io.start()
            state = my_env.reset()
            episode_reward = 0
            start_time = time.time()
            while True:
                state = torch.FloatTensor(state)
                action = agent.select_action(state)
                next_state, reward, count, done = my_env.step(action)
                episode_reward += reward
                state = next_state
                if done:
                    break
            rewards.append(episode_reward)
            times.append(str(time.time() - start_time) + "\n")
            io.join()
        #print("Episode: {}, noise: {}, reward: {}, average reward: {}".format(
        #    i_episode, ounoise.scale, rewards[-1], np.mean(rewards[-100:])))

    fo = open("times.txt", "w")
    fo.writelines(times)
    fo.close()
    sock.close()