Example #1
import gym
import numpy as np

import utils  # project-local helpers; provides trim_state()

def eval_policy(policy,
                env_name,
                broken_info=False,
                eval_episodes=1,
                real_robot=False,
                seed=0):
    env_seed = 2**32 - 1 - seed  # evaluate with a seed disjoint from the training seed
    if real_robot:
        eval_env = gym.make(env_name,
                            device_path='/dev/tty.usbserial-FT3WI485')
    else:
        eval_env = gym.make(env_name)
    eval_env.seed(env_seed)

    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        state = utils.trim_state(state)
        if broken_info:
            # append one flag per joint; all ones = every joint working
            state = np.concatenate((state, np.ones(9)))
        while not done:
            action = policy.select_action(np.array(state), 'test')
            state, reward, done, _ = eval_env.step(action)
            state = utils.trim_state(state)
            avg_reward += reward
            if broken_info:
                state = np.concatenate((state, np.ones(9)))

    avg_reward /= eval_episodes

    print("---------------------------------------")
    print("Evaluation over {} episodes: {:.3f}".format(eval_episodes,
                                                       avg_reward))
    print("---------------------------------------")
    return avg_reward
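A minimal usage sketch for eval_policy, assuming a trained agent that exposes select_action(state, 'test') (like the DDPG agent in the later examples); the environment id and episode count below are placeholders, not taken from the source:

# assumes `policy` is any trained agent exposing select_action(state, 'test')
avg_return = eval_policy(policy,
                         env_name='YourRobotEnv-v0',  # placeholder environment id
                         broken_info=True,
                         eval_episodes=10,
                         real_robot=False,
                         seed=0)
print('mean evaluation return: {:.3f}'.format(avg_return))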
Example #2
    def step(adversarial_actions, current_state):
        """Roll the agent forward for args.broken_timesteps steps with the given joints broken.

        adversarial_actions is an iterable of broken-joint indices. current_state is the
        processed observation whose broken-info flags must all be 1s; the unprocessed
        next_state is returned.
        """
        if args.broken_info:
            # append broken-joint flags, then zero the joints the adversary breaks
            current_state = np.concatenate((current_state, np.ones(9)))
            for ad_action in adversarial_actions:
                current_state[original_state_dim + ad_action] = 0
        # broken_timesteps = 1

        total_done = False
        reward_list = []
        for i in range(args.broken_timesteps):
            agent_action = agent.select_action(current_state, evaluate=True)
            for ad_action in adversarial_actions:
                agent_action[ad_action] = -0.6  # broken actuator held at a fixed value
            next_state, reward, done, info = env.step(agent_action)
            original_next_state = next_state
            if args.trim_state:
                next_state = utils.trim_state(next_state)
            if args.broken_info:
                joint_info = np.ones(9)
                for ad_action in adversarial_actions:
                    joint_info[ad_action] = 0
                next_state = np.concatenate((next_state, joint_info))
            reward_list.append(reward)
            if done:
                total_done = done
                break
            current_state = next_state
        avg_reward = np.array(reward_list).mean()
        return original_next_state, avg_reward, total_done, info
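A minimal sketch of the broken-info observation layout that the step() wrapper above expects: the trimmed state is extended with one flag per joint (1 = working, 0 = broken) and the adversary's joint indices are zeroed. The state dimension and broken joints below are placeholders:

import numpy as np

original_state_dim = 24               # placeholder; depends on the environment
trimmed_state = np.zeros(original_state_dim)

joint_info = np.ones(9)               # one flag per joint, all working by default
for broken_joint in (2, 5):           # hypothetical adversarial_actions
    joint_info[broken_joint] = 0

augmented_state = np.concatenate((trimmed_state, joint_info))
assert augmented_state.shape == (original_state_dim + 9,)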
Example #3
    def step(adversarial_action: int, ddpg_obs):
        # single-joint variant: mark the chosen joint as broken in the observation
        current_state = ddpg_obs
        current_state[original_state_dim + adversarial_action] = 0
        broken_timesteps = 1

        total_done = False
        reward_list = []
        for i in range(broken_timesteps):
            ddpg_action = ddpg.select_action(current_state, 'test')
            ddpg_action[adversarial_action] = -0.6
            next_state, reward, done, info = env.step(ddpg_action)
            original_next_state = next_state
            next_state = utils.trim_state(next_state)
            next_state = np.concatenate((next_state, np.ones(9)))
            next_state[original_state_dim + adversarial_action] = 0
            reward_list.append(reward)
            if done:
                total_done = done
                break
            current_state = next_state
        avg_reward = np.array(reward_list).mean()
        return original_next_state, avg_reward, total_done, info
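A minimal sketch (an assumption, not taken from the source) of the two action spaces implied by the wrappers above: the adversary picks a joint index from a discrete space, while the protagonist outputs one continuous command per joint and has the broken joint's command overridden:

from gym import spaces

adversary_space = spaces.Discrete(9)                      # which joint to break
agent_space = spaces.Box(low=-1.0, high=1.0, shape=(9,))  # one command per joint

adversarial_action = adversary_space.sample()
agent_action = agent_space.sample()
agent_action[adversarial_action] = -0.6                   # stuck actuator, as above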
Example #4
    t = 0
    agent_t = 0
    adversary_t = 0

    done = False
    minimal_indexes = [0, 0]

    for i_episode in itertools.count(1):
        if t > args.max_timesteps:
            break
        for agent_episode in range(args.agent_training_episodes):
            done = False
            " the agent training loop"
            current_state = env.reset()
            if args.trim_state:
                current_state = utils.trim_state(current_state)
            if args.broken_info:
                joint_info = np.ones(9)
                current_state = np.concatenate((current_state, joint_info))
            episode_steps = 0

            while not done:
                t += 1
                agent_t += 1
                if args.broken_info:
                    for minimal_index in minimal_indexes:
                        current_state[original_state_dim + minimal_index] = 0

                if agent_t == args.start_timesteps:
                    print("start ddpg learning")
                if agent_t < args.start_timesteps:
Example #5
    max_action = env.action_space.high[0]
    agent = SAC(num_inputs=state_dim,
                action_space=env.action_space,
                args=args,
                writer=writer,
                outdir=outdir,
                device=device)

    total_numsteps = 0
    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()
        if args.trim_state:
            state = utils.trim_state(state)

        while not done:
            if args.start_timesteps > total_numsteps:
                action = env.action_space.sample()  # Sample random action
            else:
                action = agent.select_action(state)  # Sample action from policy

            if len(agent.replay_buffer) > args.batch_size:
                # Number of updates per step in environment
                for i in range(1):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters()