def prepare_naf(env, args):
    return NAF(state_dim=env.observation_space.shape[0],
               action_num=env.action_space.shape[0],
               lr=args.learning_rate,
               batch_size=args.batch_size,
               device=args.gpu,
               shared_model=args.parameter_shared_model,
               clip_grads=args.clip_grads,
               use_batch_norm=args.use_batch_norm,
               double_q=args.double_q)
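For context, a minimal sketch of how prepare_naf() might be driven from the command line, assuming an argparse namespace that exposes the attributes the factory reads; the flag names, defaults, and the Pendulum-v0 choice are illustrative and not taken from the snippet above.

# Illustrative wiring for prepare_naf(); flag names mirror the attributes it reads.
import argparse
import gym

parser = argparse.ArgumentParser()
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--batch_size', type=int, default=128)
parser.add_argument('--gpu', default='cpu')                      # passed through as 'device'
parser.add_argument('--parameter_shared_model', action='store_true')
parser.add_argument('--clip_grads', type=float, default=None)
parser.add_argument('--use_batch_norm', action='store_true')
parser.add_argument('--double_q', action='store_true')
args = parser.parse_args()

env = gym.make('Pendulum-v0')   # any continuous-control environment works here
naf = prepare_naf(env, args)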
def play(checkpoint):
    env = gym.make('Pendulum-v0')
    env.reset()
    env.render()

    status_dim = env.observation_space.shape
    action_dim = env.action_space.shape
    naf_cfg = easydict.EasyDict(
        dict(status_dim=status_dim,
             action_dim=action_dim,
             hidden_dim=200,
             learning_rate=0.0003,
             l2_reg=0.0003,
             is_training=False))

    tf.reset_default_graph()
    sess = tf.InteractiveSession()
    # sess.run(tf.global_variables_initializer())
    estimator = NAF(naf_cfg)
    saver = tf.train.Saver()
    saver.restore(sess, checkpoint)

    while True:
        state = env.reset()
        episode_reward = 0
        env.render()
        for i in itertools.count(1):
            action = estimator.predict(np.expand_dims(state, 0))[0]
            action = np.minimum(env.action_space.high,
                                np.maximum(env.action_space.low, action))
            next_state, reward, done, _ = env.step(action)
            env.render()
            time.sleep(0.01)
            episode_reward += reward
            if i > 200 or done:
                print('episode reward:', episode_reward)
                break
            else:
                state = next_state

    sess.close()
import torch.nn as nn  # needed in both branches below for DataParallel wrapping

if torch.cuda.is_available():
    device = torch.device("cuda:0")
    torch.cuda.manual_seed(args.seed)
else:
    device = torch.device("cpu")

torch.manual_seed(args.seed)
np.random.seed(args.seed)

obs_shape = env.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
image_input = False  # default; only image observations switch this on
if len(env.observation_space.shape) == 3:
    image_input = True

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size, obs_shape,
                env.action_space, image_input)
    if torch.cuda.device_count() > 1:
        agent.model = nn.DataParallel(agent.model)
        agent.target_model = nn.DataParallel(agent.target_model)
    agent.model.to(device)
    agent.target_model.to(device)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size, obs_shape,
                 env.action_space, image_input)
    if torch.cuda.device_count() > 1:
        agent.actor = nn.DataParallel(agent.actor)
        agent.actor_target = nn.DataParallel(agent.actor_target)
        agent.actor_perturbed = nn.DataParallel(agent.actor_perturbed)
        agent.critic = nn.DataParallel(agent.critic)
        agent.critic_target = nn.DataParallel(agent.critic_target)
parse_arguments()
args = parser.parse_args()
args.env_name = "Springmass-v0"
print("Running environment " + str(args.env_name))

env = NormalizedActions(gym.make(args.env_name))
# env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(args.env_name), force=True)
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

''' DEFINE THE ACTOR RL AGENT '''
if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
    print("Initialized NAF")
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)
    print("Initialized DDPG actor")

''' DEFINE REPLAY BUFFER AND NOISE '''
memory = ReplayMemory(args.replay_size)
ounoise = OUNoise(env.action_space.shape[0])

''' ############################# Initialize the Evolution Part ############################# '''
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
args = parser.parse_args()

env = NormalizedActions(gym.make(args.env_name))
writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05,
    desired_action_stddev=args.noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0
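The setup above stops just before the interaction loop. Below is a minimal sketch of the episode loop such initialization typically feeds into, mirroring the select_action / memory.push / update_parameters pattern of the longer scripts later in this section; the exact loop structure, and the assumption that the parser also defines num_episodes, updates_per_step, and batch_size, are mine and not part of the snippet.

# Sketch of the usual training loop that follows this setup (assumed structure).
for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])
    if ounoise is not None:
        ounoise.reset()
    episode_reward = 0
    while True:
        # Noisy action during training; signature follows the calls used elsewhere here.
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
        episode_reward += reward

        # Store the transition exactly as the other scripts do.
        memory.push(state, action, torch.Tensor([not done]),
                    torch.Tensor([next_state]), torch.Tensor([reward]))
        state = torch.Tensor([next_state])

        if len(memory) > args.batch_size:
            for _ in range(args.updates_per_step):
                batch = Transition(*zip(*memory.sample(args.batch_size)))
                value_loss, policy_loss = agent.update_parameters(batch)
                updates += 1
        if done:
            break
    rewards.append(episode_reward)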
def main():
    global subdata
    t_start = time.time()

    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name', default="OurEnv-v0",
                        help='name of the environment')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001,
                        help='discount factor for model (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.4)')
    parser.add_argument('--exploration_end', type=int, default=33, metavar='N',
                        help='number of episodes with noise (default: 100)')
    parser.add_argument('--seed', type=int, default=4, metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size', type=int, default=512, metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps', type=int, default=300, metavar='N',
                        help='max episode length (default: 1000)')
    parser.add_argument('--num_episodes', type=int, default=50, metavar='N',
                        help='number of episodes (default: 1000)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent', type=bool, default=True,
                        help='save model to file')
    parser.add_argument('--load_agent', type=bool, default=False,
                        help='load model from file')
    parser.add_argument('--train_model', type=bool, default=True,
                        help='Training or run')
    parser.add_argument('--load_exp', type=bool, default=False,
                        help='load saved experience')
    parser.add_argument('--state_plot', type=bool, default=True,
                        help='plot Q values for environment')
    parser.add_argument('--greedy_steps', type=int, default=5, metavar='N',
                        help='amount of times greedy goes (default: 100)')

    args = parser.parse_args()

    #env = gym.make(args.env_name)
    env = Env()
    #env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent, Q and Q' --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    memory_g = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, '.pth')
        print("agent: naf_{}_{}_{}, is loaded".format(args.env_name, args.batch_size, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open('/home/aass/catkin_workspace/src/panda_demos/exp_replay.pk1', 'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    #sate_Q_plot(agent, 50)

    rewards = []
    total_numsteps = 0
    greedy_reward = []
    avg_greedy_reward = []
    upper_reward = []
    lower_reward = []
    steps_to_goal = []
    avg_steps_to_goal = []
    state_plot = []

    sim_reset_start()
    pub = rospy.Publisher('/ee_rl/act', DesiredErrorDynamicsMsg, queue_size=10)
    rospy.Subscriber("/ee_rl/state", StateMsg, callback)
    rate = rospy.Rate(9)
    rate.sleep()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        sim_reset()
        state = torch.Tensor(subdata).unsqueeze(0)

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode / args.exploration_end + args.final_noise_scale)
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(state, ounoise) if args.train_model else agent.select_action(state)
            a = action.numpy()[0] * 50
            act_pub = [a[0], a[1]]
            pub.publish(act_pub)
            next_state = torch.Tensor(subdata).unsqueeze(0)
            reward, done, _ = env.calc_shaped_reward(next_state)
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])

            memory.push(state, action, mask, next_state, reward)

            # if done:
            #     for i in range(total_numsteps % args.num_steps):
            #         a = i + 1
            #         memory_g.memory.append(memory.memory[-a])
            #         memory_g.position += 1

            state = next_state

            # -- training --
            # if len(memory_g) > args.batch_size / 2 and len(memory) > args.batch_size / 2 and args.train_model:
            #     for _ in range(10):
            #         transitions_b = memory.sample(args.batch_size / 2)
            #         transitions_g = memory_g.sample(args.batch_size / 2)
            #         for i in range(transitions_g):
            #             transitions_b.append(transitions_g[i])
            #         batch = Transition(*zip(*transitions_b))
            #         agent.update_parameters(batch)
            if len(memory) > args.batch_size and args.train_model:
                for _ in range(10):
                    transitions = memory.sample(args.batch_size)
                    batch = Transition(*zip(*transitions))
                    agent.update_parameters(batch)
            else:
                time.sleep(0.1)

            rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        pub.publish([0, 0])
        rewards.append(episode_reward)

        # -- plot Q value --
        if i_episode % 10 == 0:
            sate_Q_plot(agent, i_episode)

        # -- saves model --
        if args.save_agent:
            agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
            with open('exp_replay.pk1', 'wb') as output:
                pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
            #with open('exp_replay_g.pk1', 'wb') as output:
            #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

        if args.train_model:
            greedy_episode = max(args.num_episodes / 100, 5)
        else:
            greedy_episode = 10
        greedy_range = min(args.greedy_steps, greedy_episode)

        # -- calculates episode without noise --
        if i_episode % greedy_episode == 0 and not i_episode == 0:
            for _ in range(0, greedy_range + 1):
                # -- reset environment for every episode --
                sim_reset()
                state_visited = []
                action_taken = []
                print("Greedy episode ongoing")
                state = torch.Tensor(subdata).unsqueeze(0)
                episode_reward = 0
                steps = 0
                state_plot.append([])
                st = state.numpy()[0]
                sta = [st[0], st[1]]
                state_plot[_].append(sta)

                while True:
                    action = agent.select_action(state)
                    a = action.numpy()[0] * 50
                    act_pub = [a[0], a[1]]
                    pub.publish(act_pub)
                    next_state = torch.Tensor(subdata).unsqueeze(0)
                    reward, done, obs_hit = env.calc_shaped_reward(next_state)
                    episode_reward += reward

                    state_visited.append(state)
                    action_taken.append(action)
                    state = next_state
                    steps += 1

                    if done or steps == args.num_steps:
                        greedy_reward.append(episode_reward)
                        break

                    rate.sleep()

                if obs_hit:
                    steps = 300
                steps_to_goal.append(steps)

                # -- plot path --
                if i_episode % 10 == 0:
                    agent.plot_path(state_visited, action_taken, i_episode)

            upper_reward.append((np.max(greedy_reward[-greedy_range:])))
            lower_reward.append((np.min(greedy_reward[-greedy_range:])))
            avg_greedy_reward.append((np.mean(greedy_reward[-greedy_range:])))
            avg_steps_to_goal.append((np.mean(steps_to_goal[-greedy_range:])))

            print("Episode: {}, total numsteps: {}, avg_greedy_reward: {}, average reward: {}".format(
                i_episode, total_numsteps, avg_greedy_reward[-1], np.mean(rewards[-greedy_episode:])))

            # -- saves model --
            if args.save_agent:
                agent.save_model(args.env_name, args.batch_size, i_episode, '.pth')
                with open('exp_replay.pk1', 'wb') as output:
                    pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)
                #with open('exp_replay_g.pk1', 'wb') as output:
                #    pickle.dump(memory_g.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format((time.time() - t_start) / 60))
    print('Time per ep : {} s'.format((time.time() - t_start) / args.num_episodes))
    print('Mean greedy reward: {}'.format(np.mean(greedy_reward)))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))

    # -- plot learning curve --
    pos_greedy = []
    for pos in range(0, len(lower_reward)):
        pos_greedy.append(pos * greedy_episode)

    plt.title('Greedy policy outcome')
    plt.fill_between(pos_greedy, lower_reward, upper_reward, facecolor='red', alpha=0.3)
    plt.plot(pos_greedy, avg_greedy_reward, 'r')
    plt.xlabel('Number of episodes')
    plt.ylabel('Rewards')
    fname1 = 'plot1_obs_{}_{}_{}'.format(args.env_name, args.batch_size, '.png')
    plt.savefig(fname1)
    plt.close()

    plt.title('Steps to reach goal')
    plt.plot(steps_to_goal)
    plt.ylabel('Number of steps')
    plt.xlabel('Number of episodes')
    fname2 = 'plot2_obs_{}_{}_{}'.format(args.env_name, args.batch_size, '.png')
    plt.savefig(fname2)
    plt.close()
                    help='number of episodes (default: 128)')
parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                    help='model updates per simulator step (default: 5)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
args = parser.parse_args()

env = NormalizedActions(gym.make(args.env_name))
writer = SummaryWriter()

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                     desired_action_stddev=args.noise_scale,
                                     adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
def main():
    parser = argparse.ArgumentParser(description='PyTorch X-job')
    parser.add_argument('--env_name', default="Pendulum-v0",
                        help='name of the environment')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.001,
                        help='discount factor for model (default: 0.001)')
    parser.add_argument('--ou_noise', type=bool, default=True)
    parser.add_argument('--noise_scale', type=float, default=0.4, metavar='G',
                        help='initial noise scale (default: 0.3)')
    parser.add_argument('--final_noise_scale', type=float, default=0.3, metavar='G',
                        help='final noise scale (default: 0.4)')
    parser.add_argument('--exploration_end', type=int, default=33, metavar='N',
                        help='number of episodes with noise (default: 100)')
    parser.add_argument('--seed', type=int, default=4, metavar='N',
                        help='random seed (default: 4)')
    parser.add_argument('--batch_size', type=int, default=200, metavar='N',
                        help='batch size (default: 512)')
    parser.add_argument('--num_steps', type=int, default=100, metavar='N',
                        help='max episode length (default: 300)')
    parser.add_argument('--num_episodes', type=int, default=5000, metavar='N',
                        help='number of episodes (default: 5000)')
    parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                        help='hidden size (default: 128)')
    parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                        help='model updates per simulator step (default: 50)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--save_agent', type=bool, default=True,
                        help='save model to file')
    parser.add_argument('--train_model', type=bool, default=True,
                        help='Training or run')
    parser.add_argument('--load_agent', type=bool, default=False,
                        help='load model from file')
    parser.add_argument('--load_exp', type=bool, default=False,
                        help='load saved experience')
    parser.add_argument('--greedy_steps', type=int, default=10, metavar='N',
                        help='amount of times greedy goes (default: 10)')

    args = parser.parse_args()

    env = ManipulateEnv()
    #env = gym.make(args.env_name)

    writer = SummaryWriter('runs/')

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # -- initialize agent --
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space)

    # -- declare memory buffer and random process N
    memory = ReplayMemory(args.replay_size)
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None

    # -- load existing model --
    if args.load_agent:
        agent.load_model(args.env_name, args.batch_size, args.num_episodes, '.pth')
        print("agent: naf_{}_{}_{}_{}, is loaded".format(
            args.env_name, args.batch_size, args.num_episodes, '.pth'))

    # -- load experience buffer --
    if args.load_exp:
        with open('/home/quantao/Workspaces/catkin_ws/src/panda_demos/naf_env/src/exp_replay.pk1',
                  'rb') as input:
            memory.memory = pickle.load(input)
            memory.position = len(memory)

    rewards = []
    total_numsteps = 0
    updates = 0

    #env.init_ros()
    #env.reset()

    t_start = time.time()

    for i_episode in range(args.num_episodes + 1):
        # -- reset environment for every episode --
        #state = env.reset()
        state = torch.Tensor([env.reset()])

        # -- initialize noise (random process N) --
        if args.ou_noise:
            ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode / args.exploration_end + args.final_noise_scale)
            ounoise.reset()

        episode_reward = 0
        while True:
            # -- action selection, observation and store transition --
            action = agent.select_action(state, ounoise) if args.train_model else agent.select_action(state)

            next_state, reward, done, info = env.step(action)
            #env.render()
            total_numsteps += 1
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            reward = torch.Tensor([reward])
            next_state = torch.Tensor([next_state])

            #print('reward:', reward)
            memory.push(state, action, mask, next_state, reward)

            state = next_state

            #else:
            #    time.sleep(0.005)
            #env.render()
            #time.sleep(0.005)
            #env.rate.sleep()

            if done or total_numsteps % args.num_steps == 0:
                break

        if len(memory) >= args.batch_size and args.train_model:
            env.reset()
            print("Training model")
            for _ in range(args.updates_per_step * args.num_steps):
                transitions = memory.sample(args.batch_size)
                batch = Transition(*zip(*transitions))

                value_loss, policy_loss = agent.update_parameters(batch)

                writer.add_scalar('loss/value', value_loss, updates)
                writer.add_scalar('loss/policy', policy_loss, updates)
                updates += 1

        writer.add_scalar('reward/train', episode_reward, i_episode)
        print("Train Episode: {}, total numsteps: {}, reward: {}".format(
            i_episode, total_numsteps, episode_reward))
        rewards.append(episode_reward)

        greedy_numsteps = 0
        if i_episode % 10 == 0:
            #state = env.reset()
            state = torch.Tensor([env.reset()])

            episode_reward = 0
            while True:
                action = agent.select_action(state)

                next_state, reward, done, info = env.step(action)
                episode_reward += reward
                greedy_numsteps += 1

                #state = next_state
                state = torch.Tensor([next_state])

                #env.render()
                #time.sleep(0.01)
                #env.rate.sleep()

                if done or greedy_numsteps % args.num_steps == 0:
                    break

            writer.add_scalar('reward/test', episode_reward, i_episode)

            rewards.append(episode_reward)
            print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                i_episode, total_numsteps, rewards[-1], np.mean(rewards[-10:])))

    # -- saves model --
    if args.save_agent:
        agent.save_model(args.env_name, args.batch_size, args.num_episodes, '.pth')
        with open('exp_replay.pk1', 'wb') as output:
            pickle.dump(memory.memory, output, pickle.HIGHEST_PROTOCOL)

    print('Training ended after {} minutes'.format((time.time() - t_start) / 60))
    print('Time per episode: {} s'.format((time.time() - t_start) / args.num_episodes))
    print('Mean reward: {}'.format(np.mean(rewards)))
    print('Max reward: {}'.format(np.max(rewards)))
    print('Min reward: {}'.format(np.min(rewards)))
args = parser.parse_args()

env = NormalizedActions(gym.make(args.env_name))
writer = SummaryWriter()

# setup cuda
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent: Union[NAF, DDPG, SPG] = NAF(args.gamma, args.tau, args.hidden_size,
                                       env.observation_space.shape[0], env.action_space)
elif args.algo == "SPG":
    agent = SPG(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], args.num_noises, env.action_space)
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05,
    desired_action_stddev=args.noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None
env = sys.argv[1]

args = None
if env == 'mc':
    args = args_mc
elif env == 'pd':
    args = args_pd
elif env == 'll':
    args = args_ll
else:
    print('Environment not selected, please choose from: mc, pd, ll')
    exit(-1)

env = NormalizedActions(gym.make(args['env_name']))

env.seed(args['seed'])
torch.manual_seed(args['seed'])
np.random.seed(args['seed'])

agent = NAF(args['gamma'], args['tau'], args['hidden_size'],
            env.observation_space.shape[0], env.action_space)
agent.load_model(f'models/naf_{args["env_name"]}')

replay_buffer = ReplayBuffer(args['replay_size'])
ounoise = OUNoise(env.action_space.shape[0]) if args['ou_noise'] else None

run()
plot_results()
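run() and plot_results() are not defined in this snippet. Below is a hypothetical sketch of run(), under the assumption that it simply rolls out the loaded policy greedily on the env and agent created above and records returns; the episode count and the episode_returns list are invented names for illustration.

# Hypothetical run(): greedy rollouts of the loaded NAF policy (assumed behaviour).
episode_returns = []

def run(num_episodes=100):
    for _ in range(num_episodes):
        state = torch.Tensor([env.reset()])
        episode_reward, done = 0.0, False
        while not done:
            action = agent.select_action(state)          # greedy action, no noise
            next_state, reward, done, _ = env.step(action.numpy()[0])
            episode_reward += reward
            state = torch.Tensor([next_state])
        episode_returns.append(episode_reward)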
print("Folder ready")

if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print('cuda')
else:
    device = torch.device('cpu')
    print('cpu')

memory = Memory(memorySize)

env = gym.make('FetchSlide-v1')
print("Env created")
state = env.reset()
print(f'Env tested: {state}')

agent = NAF(gamma, tau, hiddenSize,
            env.observation_space["observation"].shape[0]
            + env.observation_space["achieved_goal"].shape[0]
            + env.observation_space["desired_goal"].shape[0],
            env.action_space.shape[0] - 1, device)
print("Agent created")

totalNumSteps = 0
updates = 0
rewards = []

for episode in range(numEpisodes):
    shortMemory = Memory(memorySize)
    state = env.reset()
    startingPositionPuck = state["achieved_goal"]
    orginalDistance = np.linalg.norm(startingPositionPuck - desiredGoal)
    state = stateToTensor(state, desiredGoal).to(device)
    valueLossEp = 0
    while True:
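stateToTensor() is not shown in the snippet above. A possible sketch follows, assuming it concatenates the observation, the achieved goal, and the fixed desired goal into a single batch-of-one tensor, which would match the input size handed to the NAF constructor; the exact layout is an assumption.

# Assumed helper: flatten the goal-based observation dict into one NAF input tensor.
import numpy as np
import torch

def stateToTensor(state, desiredGoal):
    flat = np.concatenate([state["observation"],
                           state["achieved_goal"],
                           desiredGoal])
    return torch.tensor(flat, dtype=torch.float32).unsqueeze(0)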
                    metavar='N', help='model updates per simulator step (default: 5)')
parser.add_argument('--replay_size', type=int, default=10000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
args = parser.parse_args()

# env = NormalizedActions(gym.make(args.env_name))
env = Arm()
# env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

if args.algo == "NAF":
    agent = NAF(args.gamma, args.tau, args.hidden_size,
                env.observation_space.shape[0], env.action_space.shape[0])
else:
    agent = DDPG(args.gamma, args.tau, args.hidden_size,
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05,
    desired_action_stddev=args.final_noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None

if load:
    state = torch.Tensor([env.reset()])
    agent.load_model("./models/ddpg_actor_dual_arms_3dof_inverse",
                     "./models/ddpg_critic_dual_arms_3dof_inverse")

state = torch.Tensor([env.reset()])
def fit_nash():
    suffix = 'Nash_{}_RC_{}_AttackMode_{}_RewardMode_{}'.format(
        args.NashMode, RC, args.AttackMode, args.RewardMode)

    # reward_file = open('reward' + suffix + '.txt', 'w')
    # attack_file = open('attacker_action' + suffix + '.txt', 'w')
    # weight_file = open('vehicle_weight' + suffix + '.txt', 'w')
    # distance_file = open('Distance' + suffix + '.txt', 'w')

    # reward_file.write("""
    # Environment Initializing...
    # The initial head car velocity is {}
    # The initial safe distance is {}
    # The Nash Eq* Factor RC is {}
    # The Reward Calculation Mode is {}
    # The Attack Mode is {}
    # The Nash Mode is {}
    # """.format(env.v_head, env.d0, RC, env.reward_mode, env.attack_mode, args.Nash))

    # reward_file.close()
    # attack_file.close()
    # weight_file.close()
    # distance_file.close()

    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space, 'veh')
    agent_attacker = NAF(args.gamma, args.tau, args.hidden_size,
                         env.observation_space, env.attacker_action_space, 'att')

    try:
        agent_vehicle.load_model('models/vehicle_' + suffix)
        print('Load vehicle RL model successfully')
    except:
        print('No existing vehicle RL model')
    try:
        agent_attacker.load_model('models/attacker_' + suffix)
        print('Load attacker RL model successfully')
    except:
        print('No existing attacker RL model')

    try:
        policy_vehicle = load_model('models/vehicle_' + suffix + '.h5')
        print('Load vehicle SL model successfully')
    except:
        policy_vehicle = create_SL_model(env.observation_space, env.vehicle_action_space, 'vehicle')
    try:
        policy_attacker = load_model('models/attacker_' + suffix + '.h5')
        print('Load attacker SL model successfully')
    except:
        policy_attacker = create_SL_model(env.observation_space, env.attacker_action_space, 'attacker')

    print('*' * 20, '\n\n\n')

    memory_vehicle = ReplayMemory(100000)
    memory_attacker = ReplayMemory(100000)
    memory_SL_vehicle = ReplayMemory(400000)
    memory_SL_attacker = ReplayMemory(400000)

    ounoise_vehicle = OUNoise(env.vehicle_action_space) if args.ou_noise else None
    ounoise_attacker = OUNoise(env.attacker_action_space) if args.ou_noise else None

    param_noise_vehicle = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
    param_noise_attacker = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None

    res_data = pd.DataFrame(columns=['Weight', 'Attack', 'Eva_distance'])
    reward_data = pd.DataFrame(columns=['Reward'])

    rewards = []
    total_numsteps = 0

    for i_episode in range(args.num_episodes):
        if i_episode % 100 == 0 and i_episode != 0:
            print('Writing to CSV files...')
            reward_data.to_csv(suffix + '.csv', index=False)
            res_data.to_csv(suffix + '.csv', index=False)

        if args.NashMode == 0:
            ETA = 0
        elif args.NashMode == 1:
            ETA = 0.5
        elif args.NashMode == 2:
            ETA = 0.1 - i_episode / args.num_episodes * 0.1

        print('No.{} episode starts... ETA is {}'.format(i_episode, ETA))

        # reward_file = open('reward' + suffix + '.txt', 'a')
        # attack_file = open('attacker_action' + suffix + '.txt', 'a')
        # weight_file = open('vehicle_weight' + suffix + '.txt', 'a')
        # distance_file = open('Distance' + suffix + '.txt', 'a')

        local_steps = 0
        state = env.reset()
        state_record = [np.array([state])]
        episode_steps = 0
        while len(state_record) < 20:
            a, b = env.random_action()
            s, _, _ = env.step(np.array([a]), np.zeros(4))
            local_steps += 1
            state_record.append(s)

        if args.ou_noise:
            ounoise_vehicle.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_vehicle.reset()
            ounoise_attacker.scale = (args.noise_scale - args.final_noise_scale) * max(
                0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
            ounoise_attacker.reset()

        episode_reward = 0
        local_steps = 0
        while True:
            sigma = random.random()
            if sigma > ETA:
                # print(state_record[-20:])
                # print('rl', torch.Tensor(state_record[-20:]).shape)
                action_vehicle = agent_vehicle.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                # print('rl', action_vehicle.shape)
                action_attacker = agent_attacker.select_action(
                    torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
                # print('rl', action_vehicle.shape)
            else:
                action_vehicle = torch.Tensor(
                    [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                     policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                action_attacker = torch.Tensor(
                    [policy_attacker.predict(state_record[-1].reshape(-1, 4)) /
                     policy_attacker.predict(state_record[-1].reshape(-1, 4)).sum()])[0]

            # constrain the vehicle weights to sum to 1
            action_vehicle = action_vehicle.numpy()[0] / (action_vehicle.numpy()[0].sum())
            action_attacker = action_attacker.numpy()[0]

            next_state, reward, done = env.step(action_vehicle, action_attacker)
            res_data = res_data.append([{'Attack': env.action_attacker,
                                         'Weight': action_vehicle,
                                         'Eva_distance': env.d}])

            # hand the processed attack value back to the original variable
            action_attacker = env.action_attacker

            total_numsteps += 1
            episode_reward += reward

            state_record.append(next_state)
            local_steps += 1
            episode_steps += 1

            if sigma > ETA:
                memory_SL_vehicle.append(state_record[-1], action_vehicle)
                memory_SL_attacker.append(state_record[-1], action_attacker)

            action_vehicle = torch.Tensor(action_vehicle.reshape(1, 4))
            action_attacker = torch.Tensor(action_attacker.reshape(1, 4))
            mask = torch.Tensor([not done])
            prev_state = torch.Tensor(state_record[-20:]).transpose(0, 1)
            next_state = torch.Tensor([next_state])
            reward_vehicle = torch.Tensor([reward])
            reward_attacker = torch.Tensor([RC - reward])

            memory_vehicle.push(prev_state, torch.Tensor(action_vehicle), mask, next_state, reward_vehicle)
            memory_attacker.push(prev_state, torch.Tensor(action_attacker), mask, next_state, reward_attacker)

            if done:
                rewards.append(episode_reward)
                print('Episode {} ends, instant reward is {:.2f}'.format(i_episode, episode_reward))
                reward_data = reward_data.append([{'Reward': episode_reward}])
                # reward_file.write('Episode {} ends, instant reward is {:.2f}\n'.format(i_episode, episode_reward))
                break

        if min(len(memory_vehicle), len(memory_SL_vehicle)) > args.batch_size:
            # start training
            for _ in range(args.updates_per_step):
                transitions_vehicle = memory_vehicle.sample(args.batch_size)
                batch_vehicle = Transition(*zip(*transitions_vehicle))

                transitions_attacker = memory_attacker.sample(args.batch_size)
                batch_attacker = Transition(*zip(*transitions_attacker))

                trans_veh = memory_SL_vehicle.sample(args.batch_size)
                trans_att = memory_SL_attacker.sample(args.batch_size)

                states_veh = []
                actions_veh = []
                states_att = []
                actions_att = []
                for sample in trans_veh:
                    state_veh, act_veh = sample
                    states_veh.append(state_veh)
                    actions_veh.append(act_veh)
                for sample in trans_att:
                    state_att, act_att = sample
                    states_att.append(state_att)
                    actions_att.append(act_att)

                states_veh = np.reshape(states_veh, (-1, env.observation_space))
                states_att = np.reshape(states_att, (-1, env.observation_space))
                actions_veh = np.reshape(actions_veh, (-1, env.vehicle_action_space))
                actions_att = np.reshape(actions_att, (-1, env.attacker_action_space))

                policy_vehicle.fit(states_veh, actions_veh, verbose=False)
                policy_attacker.fit(states_att, actions_att, verbose=False)

                agent_vehicle.update_parameters(batch_vehicle)
                agent_attacker.update_parameters(batch_attacker)

                # writer.add_scalar('loss/value', value_loss, updates)
                # writer.add_scalar('loss/policy', policy_loss, updates)

        if i_episode % 10 == 0 and i_episode != 0:
            eva_res_data = pd.DataFrame(columns=['Eva_reward', 'Eva_distance'])
            # distance_file.write('{} episode starts, recording distance...\n'.format(i_episode))

            state = env.reset()
            state_record = [np.array([state])]
            evaluate_reward = 0
            while len(state_record) < 20:
                a, b = env.random_action()
                s, _, _ = env.step(np.array([a]), np.zeros(4))
                local_steps += 1
                state_record.append(s)

            while True:
                if random.random() < ETA:
                    action_vehicle = agent_vehicle.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_vehicle, param_noise_vehicle)[:, -1, :]
                    # print('rl', action_vehicle.shape)
                    action_attacker = agent_attacker.select_action(
                        torch.Tensor(state_record[-20:]), ounoise_attacker, param_noise_attacker)[:, -1, :]
                else:
                    action_vehicle = torch.Tensor(
                        [policy_vehicle.predict(state_record[-1].reshape(-1, 4)) /
                         policy_vehicle.predict(state_record[-1].reshape(-1, 4)).sum()])[0]
                    action_attacker = torch.Tensor(
                        [policy_attacker.predict(state_record[-1].reshape(-1, 4))])[0]

                action_vehicle = action_vehicle.numpy()[0] / action_vehicle.numpy()[0].sum()
                action_attacker = action_attacker.numpy()[0]

                next_state, reward, done = env.step(action_vehicle, action_attacker, attack_mode=2)
                eva_res_data = eva_res_data.append([{'Eva_reward': evaluate_reward,
                                                     'Eva_distance': env.d}])

                evaluate_reward += reward

                if done:
                    print("Episode: {}, total numsteps: {}, reward: {}, average reward: {}".format(
                        i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    # reward_file.write("Episode: {}, total numsteps: {}, reward: {}, average reward: {}\n".format(
                    #     i_episode, total_numsteps, evaluate_reward, np.mean(rewards[-10:])))
                    break

            # writer.add_scalar('reward/test', episode_reward, i_episode)

        # reward_file.close()
        # attack_file.close()
        # weight_file.close()
        # distance_file.close()

    env.close()

    reward_data.to_csv(suffix + '_reward.csv', index=False)
    res_data.to_csv(suffix + '.csv', index=False)
    eva_res_data.to_csv(suffix + '_eva.csv', index=False)

    # save models
    agent_vehicle.save_model('vehicle_' + suffix)
    agent_attacker.save_model('attacker_' + suffix)
    policy_attacker.save('models/attacker_' + suffix + '.h5')
    policy_vehicle.save('models/vehicle_' + suffix + '.h5')
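create_SL_model() is referenced above but not shown. Below is a hedged Keras sketch under the assumptions that the observation-space and action-space arguments are plain integers (as the reshape calls above suggest) and that the supervised policy outputs normalized action weights; the layer sizes, optimizer, and loss are illustrative, not taken from the original code.

# Hypothetical create_SL_model(): a small softmax policy network (assumed design).
from keras.models import Sequential
from keras.layers import Dense

def create_SL_model(observation_space, action_space, name):
    model = Sequential(name='sl_policy_' + name)
    model.add(Dense(64, activation='relu', input_shape=(observation_space,)))
    model.add(Dense(64, activation='relu'))
    # Softmax keeps the predicted action weights non-negative and summing to 1,
    # consistent with the normalization applied to predict() outputs above.
    model.add(Dense(action_space, activation='softmax'))
    model.compile(optimizer='adam', loss='mse')
    return model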