LEARN_FREQ = 4

# quantile and option numbers for QUOTA
N_QUANT = 200
N_OPTIONS = 10

'''Environment Settings'''
# number of parallel environments for C51
N_ENVS = 16
# total number of simulation steps
STEP_NUM = int(1e+8)
# discount factor (gamma) for the MDP
GAMMA = 0.99
# render the environment while the agent plays
RENDERING = False
# OpenAI Gym environment name
ENV_NAME = args.games + 'NoFrameskip-v4'
env = SubprocVecEnv([wrap_cover(ENV_NAME) for i in range(N_ENVS)])
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape

'''Training settings'''
# check GPU availability
USE_GPU = torch.cuda.is_available()
print('USE GPU: ' + str(USE_GPU))
# mini-batch size
BATCH_SIZE = 32
# learning rate
LR = 1e-4
# epsilon-greedy exploration rates (actions and options)
EPSILON = 1.0
EPSILON_O = 1.0
# option parameter
Target_beta = 0.01
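# NOTE: the config above builds its vectorized environments through wrap_cover(ENV_NAME),
# which is defined elsewhere in the repository. Below is a minimal sketch of what such a
# factory could look like, assuming the standard baselines Atari wrappers; the concrete
# wrapper choices (no-op resets, frame skip of 4, DeepMind preprocessing, frame stacking)
# are assumptions, not the repository's actual preprocessing.
import gym
from baselines.common.atari_wrappers import NoopResetEnv, MaxAndSkipEnv, wrap_deepmind

def wrap_cover(env_name):
    """Return a thunk that builds one preprocessed Atari environment (sketch)."""
    def _thunk():
        env = gym.make(env_name)
        env = NoopResetEnv(env, noop_max=30)  # random number of no-ops at reset
        env = MaxAndSkipEnv(env, skip=4)      # frame skipping with max-pooling
        # 84x84 grayscale frames, clipped rewards, 4-frame stacking
        return wrap_deepmind(env, frame_stack=True)
    return _thunk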
def train(args):
    print(args)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.return_function == "GAE":
        return_function = GAE
    elif args.return_function == "Q":
        return_function = Q
    elif args.return_function == "A":
        return_function = A

    # a 200-step rollout is treated as a full Monte Carlo episode
    MONTE_CARLO = (args.num_steps == 200)
    envs = SubprocVecEnv(
        [make_env(args.env, i + args.num_envs) for i in range(args.num_envs)],
        MONTE_CARLO)
    test_env = gym.make(args.env)
    test_env.seed(args.seed + args.num_envs)

    policy = ActorCriticMLP(input_dim=envs.observation_space.shape[0],
                            n_acts=envs.action_space.n)
    optim = torch.optim.Adam(params=policy.parameters(),
                             lr=args.lr,
                             weight_decay=args.weight_decay)

    test_rewards = []
    steps = 1
    obs = torch.from_numpy(envs.reset())
    while steps < args.max_steps:
        logp_actions = []
        state_values = []
        rewards = []
        masks = []

        # collect a rollout of up to num_steps transitions from the vectorized envs
        for _ in range(args.num_steps):
            probs, state_value = policy.forward(obs)
            dist = Categorical(probs)
            action = dist.sample()
            obs, reward, done, _ = envs.step(action.numpy())

            logp_actions.append(dist.log_prob(action).unsqueeze(1))
            state_values.append(state_value)
            rewards.append(torch.FloatTensor(reward).unsqueeze(1))
            masks.append(torch.FloatTensor(1 - done).unsqueeze(1))

            obs = torch.from_numpy(obs)
            steps += 1

            if steps % args.test_every == 0:
                test_reward = np.mean(
                    [test(test_env, policy) for _ in range(10)])
                test_rewards.append(test_reward)
                print(f"Running reward at timestep {steps}: {test_reward}")

            # stop the rollout early if every environment is done
            if (1 - done).sum() == 0:
                break

        # bootstrap with the value of the last observation unless all envs finished
        next_value = 0
        if not (1 - done).sum() == 0:
            _, next_value = policy(obs)

        returns = return_function(next_value, rewards, masks, state_values, args)
        loss = policy_gradient(logp_actions, returns)

        optim.zero_grad()
        loss.backward()
        optim.step()

        # if Monte Carlo, we need to reset the environment by hand
        if MONTE_CARLO:
            obs = torch.from_numpy(envs.reset())

    return test_rewards
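# NOTE: the training loop above delegates to a return_function (GAE, Q, or A) and a
# policy_gradient loss helper that live elsewhere in the repository. The sketch below
# shows one plausible GAE-style return function and a matching REINFORCE-style loss,
# mirroring the call sites above; the bodies (and the assumed args.gamma / args.tau
# fields) are assumptions, and the real implementation likely also trains the critic
# with a separate value loss.
import torch

def GAE(next_value, rewards, masks, state_values, args):
    """Generalized Advantage Estimation return targets (sketch)."""
    values = state_values + [next_value]
    gae = 0
    returns = []
    for t in reversed(range(len(rewards))):
        # one-step TD error, masked to zero where the episode ended
        delta = rewards[t] + args.gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + args.gamma * args.tau * masks[t] * gae
        # advantage plus value gives the return target consumed by the loss below
        returns.insert(0, gae + values[t])
    return returns

def policy_gradient(logp_actions, returns):
    """Policy-gradient loss: -mean(log pi(a|s) * return), with returns treated as constants (sketch)."""
    logp = torch.cat(logp_actions)
    rets = torch.cat(returns).detach()
    return -(logp * rets).mean()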
if __name__ == '__main__':
    args = parser.parse_args()
    print(args)
    tf.set_random_seed(args.seed)
    np.random.seed(args.seed)

    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)

    def make_env(rank):
        def env_fn():
            env = gym.make('{}NoFrameskip-v4'.format(args.env_name))
            env.seed(args.seed + rank)
            env = Monitor(
                env, osp.join(args.log_dir, "{}.monitor.json".format(rank)))
            gym.logger.setLevel(logging.WARN)
            return wrap_deepmind(env, num_skips=args.num_skips)
        return env_fn

    env = SubprocVecEnv([make_env(i) for i in range(args.nenvs)])
    policy = CnnPolicy
    learn(policy, env, args.seed, args.num_skips, args.model_path,
          gamma=args.gamma)
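# NOTE: the entry point above reads from a module-level argparse parser that is defined
# outside this excerpt. The sketch below reconstructs a parser covering only the flags
# the __main__ block actually uses; the defaults are illustrative placeholders, not the
# repository's real defaults.
import argparse

parser = argparse.ArgumentParser(description='Train a CNN policy on an Atari game')
parser.add_argument('--env-name', type=str, default='Breakout')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--log-dir', type=str, default='./logs')
parser.add_argument('--num-skips', type=int, default=4)
parser.add_argument('--nenvs', type=int, default=16)
parser.add_argument('--model-path', type=str, default='./models')
parser.add_argument('--gamma', type=float, default=0.99)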
    return rewards


if __name__ == '__main__':
    logger.configure(f'{C.env_id}/logs_{time_stamp}')
    for k, v in C._asdict().items():
        logger.record_tabular(k, v)
    logger.dump_tabular()

    max_reward = tf.placeholder(tf.float32, name='max_reward')
    mean_reward = tf.placeholder(tf.float32, name='mean_reward')
    max_summary = tf.summary.scalar('max_rew', max_reward)
    mean_summary = tf.summary.scalar('mean_rew', mean_reward)

    with create_session(0) as sess:
        eval_env = make_atari(C.env_id, 113, 'eval')()
        envs = SubprocVecEnv(
            [make_atari(C.env_id, r + 1, 'train') for r in range(4)])
        model = Model(eval_env.observation_space.shape, eval_env.action_space.n)
        runner = Runner(envs, model.policy, nb_rollout=C.nb_rollout)
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(
            './{}/summary/{}'.format(C.env_id, time_stamp), sess.graph)

        for i in range(C.iterations):
            if i % C.eval_freq == 0:
                rewards = evaluate(eval_env, model.policy, C.eval_episodes)
                logger.log(
                    f'Step: {i} | Max reward: {np.max(rewards)} | '
                    f'Mean reward: {np.mean(rewards):.2f} | Std: {np.std(rewards):.2f}')
                me, ma = sess.run([mean_summary, max_summary],
                                  feed_dict={