def main(arguments: argparse.Namespace) -> None:
    """
    Main training loop.

    :param arguments: Parsed command-line arguments.
    :return: None
    """
    n_steps = arguments.steps
    n_agents = arguments.envs

    print(f'Training {arguments.game} using {"cpu" if arguments.cpu else "gpu"}')
    print(f'Number of concurrent environments {arguments.envs}')
    print(f'Number of steps per batch {arguments.steps}')
    if arguments.model:
        print(f'Using existing model {arguments.model}')

    env = SubprocVecEnv(
        [make_env(env_id=arguments.game, rank=i) for i in range(n_agents)])
    agent = DeepLearningAgent(observation_space=env.observation_space,
                              action_space=int(env.action_space.n),
                              n_envs=n_agents,
                              n_steps=n_steps,
                              model_path=arguments.model,
                              use_cpu=arguments.cpu)

    # The current state (observation) for every environment
    observations = reshape_observations(env.reset())
    actions = agent.get_action(observations)

    initial_training_time = time.time()
    for ep in range(EPISODES):
        # Collect one batch of n_steps transitions from each environment
        for i in range(n_steps):
            new_observations, rewards, done, info = env.step(
                actions.cpu().numpy())
            new_observations = reshape_observations(new_observations)
            agent.train(s=observations,
                        r=rewards,
                        s_next=new_observations,
                        a=actions,
                        done=done,
                        step=i)
            actions = agent.get_action(new_observations)
            observations = new_observations

        if ep % 100 == 0:
            fps = ((ep + 1) * n_steps * n_agents) / (time.time() -
                                                     initial_training_time)
            print(f'FPS {fps}')

    env.close()
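# The block below is a minimal sketch of the command-line entry point implied by the
# attribute accesses in main() above (arguments.game, arguments.envs, arguments.steps,
# arguments.cpu, arguments.model). The flag names, defaults, and help strings are
# assumptions for illustration, not the repository's actual CLI; it would normally live
# at the bottom of the same script.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train an agent on a retro/Gym game.')
    parser.add_argument('--game', type=str, required=True,
                        help='Environment id passed to make_env (assumed flag name)')
    parser.add_argument('--envs', type=int, default=8,
                        help='Number of concurrent environments (assumed default)')
    parser.add_argument('--steps', type=int, default=128,
                        help='Steps collected per environment per batch (assumed default)')
    parser.add_argument('--cpu', action='store_true',
                        help='Train on CPU instead of GPU')
    parser.add_argument('--model', type=str, default=None,
                        help='Path to an existing model to continue training from')
    main(parser.parse_args())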
def main():
    # Alter reward in scenario.json
    # (C:\Users\Fergus\Anaconda3\envs\AIGym\Lib\site-packages\retro\data\stable\SonicTheHedgehog-Genesis)
    env = SubprocVecEnv([make_env_3])
    obs = env.reset()
    # env = make_env_3()
    # env2 = make_env_4()
    print(env.observation_space)
    print(env.action_space.n)
    print(obs.shape)
    print(obs[0].shape)
    # obs = env2.reset()

    rew_mb = []
    dones_mb = []
    obs_mb = []
    step = 0
    while True:
        action = env.action_space.sample()  # sampled action is currently unused
        obs, rew, done, info = env.step([0])  # a fixed action (0) is sent to the single env
        print("Step {} Reward: {}, Done: {}".format(step, rew, done))
        rew_mb.append(rew)
        dones_mb.append(done)
        obs_mb.append(obs)
        env.render()
        step += 1
        # obs = obs[1] / 255.
        # for i in range(4):
        #     cv2.imshow('GrayScale' + str(i), np.squeeze(obs[:, :, i]))
        #     cv2.waitKey(1)
        if done[0]:
            env.close()
            break

    rew_mb = np.array(rew_mb)
    dones_mb = np.array(dones_mb)
    obs_mb = np.array(obs_mb)
    print("Rewards: ", rew_mb)
    print(rew_mb.shape)
    print(dones_mb)
    print(dones_mb.shape)
    print(obs_mb.shape)
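# A minimal sketch of what a make_env_3 factory could look like with gym-retro, included
# only to show where the scenario.json mentioned in the comment above comes into play
# (it defines the reward returned by env.step). The project's real make_env_3 presumably
# adds its own wrappers (frame preprocessing, action discretisation); the state name and
# the bare retro.make call here are assumptions.
def make_env_3_sketch():
    import retro
    # GreenHillZone.Act1 is one of the save states shipped with the
    # SonicTheHedgehog-Genesis integration; treat it as a placeholder.
    return retro.make(game='SonicTheHedgehog-Genesis', state='GreenHillZone.Act1')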
def main(hParams, n_run, total_timesteps):
    nsteps = hParams['N_STEPS']
    n_epochs = hParams['N_EPOCHS']
    n_train = 4
    n_minibatch = 8
    log_loss_int = 1
    save_int = 5
    test_int = 10
    test_episodes = 5
    gamma = 0.95
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(ent_coef)
    testenvfn = SonicEnv.make_env_3

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/sonic_long_test/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    env = SubprocVecEnv([SonicEnv.make_env_3])
    nenv = env.num_envs
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = PGNetwork(state_size,
                      num_actions,
                      lr=lr,
                      vf_coef=vf_coef,
                      ent_coef=ent_coef)

    # Runner used to create training data
    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    # total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps
    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):
        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run()

        # Several passes over the batch, shuffled into minibatches
        for _ in range(n_train):
            indices = np.arange(nbatch)
            np.random.shuffle(indices)
            for start in range(0, nbatch, nbatch // n_minibatch):
                end = start + nbatch // n_minibatch
                bind = indices[start:end]
                policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
                    states_mb[bind], actions_mb[bind], rewards_mb[bind],
                    values_mb[bind])

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)

        # Explained variance of the value predictions over this batch
        r2 = 1 - (np.var(rewards_mb - values_mb) / np.var(rewards_mb))
        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)
            tf.summary.scalar("R-squared", r2, step=update)

        if update % log_loss_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)
        if update % save_int == 0:
            pgnet.model.save_weights('sonic_long_test/' + save_dir +
                                     '/my_checkpoint')
            print("Model Saved")
        if update % test_int == 0:
            TestRewardWriter(summ_writer, testenvfn, pgnet, test_episodes,
                             global_step=update)

    with summ_writer.as_default():
        hp.hparams(hParams)
    env.close()
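# A minimal sketch (not the repository's SonicEnvRunner) of how an A2C-style runner
# typically turns raw rewards into the rewards_mb targets consumed by fit_gradient above:
# n-step discounted returns, bootstrapped from the value estimate of the last observed
# state and cut off at episode boundaries. The shapes and bootstrap convention are
# assumptions for illustration.
def discounted_returns(rewards, dones, last_values, gamma):
    """rewards, dones: arrays of shape [nsteps, nenv]; last_values: shape [nenv]."""
    returns = np.zeros_like(rewards)
    running = last_values
    for t in reversed(range(rewards.shape[0])):
        # Zero out the bootstrap whenever the episode ended at step t
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns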
def main(hParams, n_run):
    nsteps = hParams['N_STEPS']
    nenv = hParams[HP_N_ENV]
    n_epochs = hParams['N_EPOCHS']
    total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps
    update_int = 1
    save_int = 5
    test_int = 10
    gamma = 0.99
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(
        ent_coef) + 'env' + str(nenv)

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/cart_hparam_tuning/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    envfn = lambda: gym.make('CartPole-v1')
    env = SubprocVecEnv([envfn] * nenv)
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = SimplePGNet(state_size,
                        num_actions,
                        learning_rate=lr,
                        vf_coef=vf_coef,
                        ent_coef=ent_coef)
    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):
        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run()

        tf.summary.trace_on(graph=True)
        policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
            states_mb, actions_mb, rewards_mb, values_mb)
        if update == 1:
            with summ_writer.as_default():
                tf.summary.trace_export(name="grad_trace", step=0)

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)

        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)

        if update % update_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)
        if update % save_int == 0:
            pgnet.model.save_weights('cart_hparams_tuning_models/' + save_dir +
                                     '/my_checkpoint')
            print("Model Saved")
        if update % test_int == 0:
            test_rewards = TestRewardWriter(summ_writer, envfn, pgnet, 20,
                                            global_step=update)
            print("Test Rewards: ", test_rewards)

    with summ_writer.as_default():
        hp.hparams(hParams)
    env.close()
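# A minimal sketch of the hyperparameter sweep driver implied by the hParams dict and the
# hp.hparams() call above, using the TensorBoard HParams API. The value grids, the assumed
# N_STEPS/N_EPOCHS settings, and the exact HParam definitions (HP_LEARNING_RATE, HP_VF_COEF,
# HP_ENT_COEF, HP_N_ENV) are illustrative assumptions, not the project's actual sweep.
from tensorboard.plugins.hparams import api as hp

HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([1e-4, 3e-4]))
HP_VF_COEF = hp.HParam('vf_coef', hp.Discrete([0.25, 0.5]))
HP_ENT_COEF = hp.HParam('ent_coef', hp.Discrete([0.01, 0.001]))
HP_N_ENV = hp.HParam('n_env', hp.Discrete([4, 8]))

if __name__ == '__main__':
    n_run = 0
    for lr in HP_LEARNING_RATE.domain.values:
        for vc in HP_VF_COEF.domain.values:
            for ec in HP_ENT_COEF.domain.values:
                for ne in HP_N_ENV.domain.values:
                    hParams = {
                        'N_STEPS': 32,   # assumed rollout length
                        'N_EPOCHS': 50,  # assumed number of epochs
                        HP_LEARNING_RATE: lr,
                        HP_VF_COEF: vc,
                        HP_ENT_COEF: ec,
                        HP_N_ENV: ne,
                    }
                    main(hParams, n_run)
                    n_run += 1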