Example #1
# learning frequency (environment steps between parameter updates)
LEARN_FREQ = 4
# numbers of quantiles and options for QUOTA
N_QUANT = 200
N_OPTIONS = 10
'''Environment Settings'''
# number of environments for C51
N_ENVS = 16
# total number of simulation steps
STEP_NUM = int(1e+8)
# gamma for MDP
GAMMA = 0.99
# whether to render the agent while playing
RENDERING = False
# OpenAI Gym environment name
ENV_NAME = args.games + 'NoFrameskip-v4'
env = SubprocVecEnv([wrap_cover(ENV_NAME) for i in range(N_ENVS)])
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape
'''Training settings'''
# check GPU usage
USE_GPU = torch.cuda.is_available()
print('USE GPU: ' + str(USE_GPU))
# mini-batch size
BATCH_SIZE = 32
# learning rate
LR = 1e-4
# epsilon-greedy
EPSILON = 1.0
EPSILON_O = 1.0
# option parameter
Target_beta = 0.01
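SubprocVecEnv expects a list of zero-argument callables, one per worker process, so the wrap_cover helper used above presumably returns such a constructor rather than an environment instance. A minimal sketch of what it might look like, assuming the usual Atari preprocessing (the wrapper chain is an assumption and is not shown in the original):

import gym

def wrap_cover(env_name):
    # Return a thunk; SubprocVecEnv calls it inside each worker process,
    # so every subprocess builds its own environment instance.
    def _thunk():
        env = gym.make(env_name)  # e.g. 'PongNoFrameskip-v4'
        # the usual Atari wrappers (frame skip, resize, grayscale,
        # frame stacking) would be applied here; omitted in this sketch
        return env
    return _thunk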
Example #2
def train(args):
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A

    MONTE_CARLO = args.num_steps == 200

    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        MONTE_CARLO)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)
    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    steps = 1

    obs = torch.from_numpy(envs.reset())
    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []

        for _ in range(args.num_steps):
            probs, state_value = policy(obs)
            dist = Categorical(probs)
            action = dist.sample()

            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))
            obs = torch.from_numpy(obs)
            steps += 1

            if steps % args.test_every == 0:
                test_reward = np.mean(
                    [test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: and {test_reward}")

            if (1 - done).sum() == 0:
                break

        # bootstrap from the critic's value of the last state unless every environment terminated
        next_value = 0
        if (1 - done).sum() != 0:
            _, next_value = policy(obs)

        returns = return_function(next_value, rewards, masks, state_values,
                                  args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()

        optim.step()
        # in Monte Carlo mode, the environments must be reset by hand
        if MONTE_CARLO:
            obs = torch.from_numpy(envs.reset())
    return test_rewards
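The GAE, Q and A return estimators selected at the top of train() are not shown here. Below is a minimal GAE sketch that matches the call signature return_function(next_value, rewards, masks, state_values, args); args.gamma and args.tau (the GAE lambda) are assumed hyperparameters, and whether policy_gradient expects raw advantages or value-baselined return targets depends on the original helper.

def GAE(next_value, rewards, masks, state_values, args):
    # rewards, masks and state_values are lists of [num_envs, 1] tensors
    # collected during the rollout; next_value is 0 or the critic's
    # estimate for the state after the last step.
    values = state_values + [next_value]
    gae = 0
    returns = []
    for t in reversed(range(len(rewards))):
        # one-step TD error, masked where an episode ended
        delta = rewards[t] + args.gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + args.gamma * args.tau * masks[t] * gae
        # advantage plus baseline gives the return target
        returns.insert(0, gae + values[t])
    return returns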
Example #3
if __name__ == '__main__':
    args = parser.parse_args()
    print(args)

    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    def make_env(rank):
        def env_fn():
            env = gym.make('{}NoFrameskip-v4'.format(args.env_name))
            env.seed(args.seed + rank)
            env = Monitor(
                env, osp.join(args.log_dir, "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env, num_skips=args.num_skips)

        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(args.nenvs)])
    policy = CnnPolicy
    learn(policy,
          env,
          args.seed,
          args.num_skips,
          args.model_path,
          gamma=args.gamma)
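As a usage note (not part of the original example), the object returned by SubprocVecEnv exposes the usual batched Gym interface, so it can be smoke-tested with random actions before handing it to learn; a minimal sketch using the env and args defined above:

import numpy as np

obs = env.reset()  # stacked observations with leading dimension args.nenvs
actions = np.array([env.action_space.sample() for _ in range(args.nenvs)])
obs, rewards, dones, infos = env.step(actions)  # one transition per worker
print(obs.shape, rewards.shape, dones.shape)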
Example #4
    return rewards


if __name__ == '__main__':
    logger.configure(f'{C.env_id}/logs_{time_stamp}')
    for k, v in C._asdict().items():
        logger.record_tabular(k, v)
    logger.dump_tabular()
    max_reward = tf.placeholder(tf.float32, name='max_reward')
    mean_reward = tf.placeholder(tf.float32, name='mean_reward')
    max_summary = tf.summary.scalar('max_rew', max_reward)
    mean_summary = tf.summary.scalar('mean_rew', mean_reward)

    with create_session(0) as sess:
        eval_env = make_atari(C.env_id, 113, 'eval')()
        envs = SubprocVecEnv(
            [make_atari(C.env_id, r + 1, 'train') for r in range(4)])
        model = Model(eval_env.observation_space.shape,
                      eval_env.action_space.n)
        runner = Runner(envs, model.policy, nb_rollout=C.nb_rollout)
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(
            './{}/summary/{}'.format(C.env_id, time_stamp), sess.graph)

        for i in range(C.iterations):
            if i % C.eval_freq == 0:
                rewards = evaluate(eval_env, model.policy, C.eval_episodes)
                logger.log(
                    f'Step: {i} | Max reward: {np.max(rewards)} | Mean reward: {np.mean(rewards):.2f} | Std: {np.std(rewards):.2f}'
                )
                me, ma = sess.run([mean_summary, max_summary],
                                  feed_dict={