import gym
import numpy as np

import utils  # repo-local helpers (trim_state, ...)


def eval_policy(policy, env_name, broken_info=False, eval_episodes=1, real_robot=False, seed=0):
    """Run the policy for a few episodes and report the mean return."""
    env_seed = 2**32 - 1 - seed  # keep evaluation seeds disjoint from training seeds
    if real_robot:
        eval_env = gym.make(env_name, device_path='/dev/tty.usbserial-FT3WI485')
    else:
        eval_env = gym.make(env_name)
    eval_env.seed(env_seed)

    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        state = utils.trim_state(state)
        if broken_info:
            # Append an all-ones joint-health mask: no joints are broken at eval time.
            state = np.concatenate((state, np.ones(9)))
        while not done:
            action = policy.select_action(np.array(state), 'test')
            state, reward, done, _ = eval_env.step(action)
            state = utils.trim_state(state)
            avg_reward += reward
            if broken_info:
                state = np.concatenate((state, np.ones(9)))

    avg_reward /= eval_episodes

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.3f}".format(eval_episodes, avg_reward))
    print("---------------------------------------")
    return avg_reward
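
# eval_policy appends an all-ones vector of length 9 to the trimmed state: a
# joint-health mask (1 = working joint, 0 = broken) for the robot's nine
# joints. A small sketch of that pattern as a reusable helper -- the function
# name and defaults are mine, not repo code:
def augment_with_joint_mask(state, broken_joints=(), n_joints=9):
    """Append a binary joint-health mask to a trimmed observation."""
    mask = np.ones(n_joints)
    for j in broken_joints:
        mask[j] = 0
    return np.concatenate((state, mask))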
def step(adversarial_actions, current_state):
    """Adversarial environment step for a set of broken joints.

    `current_state` is the processed (trimmed) observation without the
    joint-health mask; when enabled, an all-ones mask is appended here and
    the attacked joints' bits are zeroed. Returns the *unprocessed* next
    state, so the caller re-applies the processing.
    """
    if args.broken_info:
        current_state = np.concatenate((current_state, np.ones(9)))
    for ad_action in adversarial_actions:
        current_state[original_state_dim + ad_action] = 0

    total_done = False
    reward_list = []
    # Hold the same breakage fixed for args.broken_timesteps agent steps.
    for i in range(args.broken_timesteps):
        agent_action = agent.select_action(current_state, evaluate=True)
        for ad_action in adversarial_actions:
            agent_action[ad_action] = -0.6  # a broken joint is stuck at this command
        next_state, reward, done, info = env.step(agent_action)
        original_next_state = next_state
        if args.trim_state:
            next_state = utils.trim_state(next_state)
        if args.broken_info:
            joint_info = np.ones(9)
            for ad_action in adversarial_actions:
                joint_info[ad_action] = 0
            next_state = np.concatenate((next_state, joint_info))
        reward_list.append(reward)
        if done:
            total_done = done
            break
        current_state = next_state

    avg_reward = np.array(reward_list).mean()
    return original_next_state, avg_reward, total_done, info
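
# Design note: step() clamps the broken joints' commands to -0.6 (a stuck
# position) and returns the reward averaged over the breakage window. A
# minimal driver sketch, assuming the surrounding globals (env, args, utils,
# step); the random joint choice is a stand-in for a learned adversary:
obs = env.reset()
if args.trim_state:
    obs = utils.trim_state(obs)
done = False
while not done:
    broken = list(np.random.choice(9, size=2, replace=False))  # stand-in adversary
    raw_next, avg_r, done, info = step(broken, obs)
    obs = utils.trim_state(raw_next) if args.trim_state else raw_next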
def step(adversarial_action: int, ddpg_obs):
    """Single-broken-joint variant of the adversarial step, for the DDPG agent.

    `ddpg_obs` already carries the joint-health mask; the attacked joint's
    bit is zeroed in place. Returns the unprocessed next state.
    """
    current_state = ddpg_obs
    current_state[original_state_dim + adversarial_action] = 0

    broken_timesteps = 1
    total_done = False
    reward_list = []
    for i in range(broken_timesteps):
        ddpg_action = ddpg.select_action(current_state, 'test')
        ddpg_action[adversarial_action] = -0.6  # the broken joint is stuck at this command
        next_state, reward, done, info = env.step(ddpg_action)
        original_next_state = next_state
        next_state = utils.trim_state(next_state)
        next_state = np.concatenate((next_state, np.ones(9)))
        next_state[original_state_dim + adversarial_action] = 0
        reward_list.append(reward)
        if done:
            total_done = done
            break
        current_state = next_state

    avg_reward = np.array(reward_list).mean()
    return original_next_state, avg_reward, total_done, info
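
# A hedged usage sketch for the single-joint step(): roll out one episode with
# joint j broken throughout. Because step() returns the raw next state, the
# caller re-trims it and re-appends the all-ones mask before the next call
# (the helper name is mine, not repo code):
def rollout_with_broken_joint(j):
    obs = np.concatenate((utils.trim_state(env.reset()), np.ones(9)))
    total, done = 0.0, False
    while not done:
        raw_next, avg_r, done, _ = step(j, obs)
        total += avg_r
        obs = np.concatenate((utils.trim_state(raw_next), np.ones(9)))
    return total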
t = 0
agent_t = 0
adversary_t = 0
done = False
minimal_indexes = [0, 0]  # the adversary's current pick of broken joints

for i_episode in itertools.count(1):
    if t > args.max_timesteps:
        break
    for agent_episode in range(args.agent_training_episodes):
        # --- the agent training loop ---
        done = False
        current_state = env.reset()
        if args.trim_state:
            current_state = utils.trim_state(current_state)
        if args.broken_info:
            joint_info = np.ones(9)
            current_state = np.concatenate((current_state, joint_info))
        episode_steps = 0
        while not done:
            t += 1
            agent_t += 1
            if args.broken_info:
                # Zero the mask bits of the joints the adversary has broken.
                for minimal_index in minimal_indexes:
                    current_state[original_state_dim + minimal_index] = 0
            if agent_t == args.start_timesteps:
                print("start ddpg learning")
            if agent_t < args.start_timesteps:
                # Warm-up: sample a random action before learning starts,
                # as in the SAC loop below.
                action = env.action_space.sample()
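
# minimal_indexes is initialised to [0, 0] above and holds the joints the
# adversary currently breaks during agent training. A sketch (hypothetical
# helper, not repo code) of refreshing it by probing each joint with the
# multi-joint step() wrapper and keeping the two most damaging ones:
def refresh_minimal_indexes(keep=2, n_joints=9, probes=10):
    avg_rewards = []
    for j in range(n_joints):
        obs = env.reset()
        if args.trim_state:
            obs = utils.trim_state(obs)
        total = 0.0
        for _ in range(probes):
            raw_next, r, done, _ = step([j], obs)
            total += r
            if done:
                break
            obs = utils.trim_state(raw_next) if args.trim_state else raw_next
        avg_rewards.append(total / probes)
    return list(np.argsort(avg_rewards)[:keep])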
max_action = env.action_space.high[0]
agent = SAC(num_inputs=state_dim, action_space=env.action_space, args=args,
            writer=writer, outdir=outdir, device=device)

total_numsteps = 0
for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    if args.trim_state:
        state = utils.trim_state(state)
    while not done:
        if args.start_timesteps > total_numsteps:
            action = env.action_space.sample()  # Sample random action
        else:
            action = agent.select_action(state)  # Sample action from policy

        if len(agent.replay_buffer) > args.batch_size:
            # Number of updates per step in environment
            for i in range(1):
                # Update parameters of all the networks
                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = \
                    agent.update_parameters()
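
        # A typical continuation of this loop, sketched under the assumption
        # that agent.replay_buffer exposes a push() method (suggested by the
        # len(agent.replay_buffer) check above); not verbatim repo code:
        next_state, reward, done, _ = env.step(action)
        if args.trim_state:
            next_state = utils.trim_state(next_state)
        episode_steps += 1
        total_numsteps += 1
        episode_reward += reward

        # Store the transition and advance the state.
        agent.replay_buffer.push(state, action, reward, next_state, float(done))
        state = next_state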