Example #1
def main(arguments: argparse.Namespace) -> None:
    """
    Main training loop.
    :param arguments: Parsed command-line arguments.
    :return: None
    """
    n_steps = arguments.steps
    n_agents = arguments.envs

    print(f'Training {arguments.game} using {"cpu" if arguments.cpu else "gpu"}')
    print(f'Number of concurrent environments {arguments.envs}')
    print(f'Number of steps per batch {arguments.steps}')

    if arguments.model:
        print(f'Using existing model {arguments.model}')

    env = SubprocVecEnv(
        [make_env(env_id=arguments.game, rank=i) for i in range(n_agents)])
    agent = DeepLearningAgent(observation_space=env.observation_space,
                              action_space=int(env.action_space.n),
                              n_envs=n_agents,
                              n_steps=n_steps,
                              model_path=arguments.model,
                              use_cpu=arguments.cpu)

    # This is the current state (or observation)
    observations = reshape_observations(env.reset())
    actions = agent.get_action(observations)
    initial_training_time = time.time()

    for ep in range(EPISODES):
        # Reset the frame counter each time the batch size is complete
        for i in range(n_steps):
            new_observations, rewards, done, info = env.step(
                actions.cpu().numpy())
            new_observations = reshape_observations(new_observations)

            agent.train(s=observations,
                        r=rewards,
                        s_next=new_observations,
                        a=actions,
                        done=done,
                        step=i)

            actions = agent.get_action(new_observations)
            observations = new_observations

        if ep % 100 == 0:
            fps = ((ep + 1) * n_steps * n_agents) / (time.time() -
                                                     initial_training_time)
            print(f'FPS {fps}')

    env.close()
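
Example #1 reads arguments.game, arguments.envs, arguments.steps, arguments.model, and arguments.cpu but does not show the parser. Below is a minimal, purely illustrative sketch of the argparse front end it assumes; the flag names mirror the attributes used above, while the defaults are placeholders.

import argparse


def parse_arguments() -> argparse.Namespace:
    # Hypothetical parser exposing the attributes that main() reads above.
    parser = argparse.ArgumentParser(description='Training entry point')
    parser.add_argument('--game', type=str, default='SonicTheHedgehog-Genesis',
                        help='Environment id passed to make_env')
    parser.add_argument('--envs', type=int, default=8,
                        help='Number of concurrent environments')
    parser.add_argument('--steps', type=int, default=128,
                        help='Number of steps per batch')
    parser.add_argument('--model', type=str, default=None,
                        help='Path to an existing model to resume from')
    parser.add_argument('--cpu', action='store_true',
                        help='Train on CPU instead of GPU')
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_arguments())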
Example #2
def main():
    # Alter reward in scenario.json (C:\Users\Fergus\Anaconda3\envs\AIGym\Lib\site-packages\retro\data\stable\SonicTheHedgehog-Genesis)

    env = SubprocVecEnv([make_env_3])
    obs = env.reset()
    # env = make_env_3()
    # env2 = make_env_4()
    print(env.observation_space)
    print(env.action_space.n)
    print(obs.shape)
    print(obs[0].shape)
    # obs = env2.reset()
    rew_mb = []
    dones_mb = []
    obs_mb = []
    step = 0
    while True:
        # Step the single vectorised environment with a randomly sampled action
        action = env.action_space.sample()
        obs, rew, done, info = env.step([action])
        print("Step {} Reward: {}, Done: {}".format(step, rew, done))
        rew_mb.append(rew)
        dones_mb.append(done)
        obs_mb.append(obs)
        env.render()

        step += 1
        # obs = obs[1] / 255.
        # for i in range(4):
        #     cv2.imshow('GrayScale'+str(i), np.squeeze(obs[:,:,i]))
        #     cv2.waitKey(1)
        if done[0]:
            env.close()
            break
    rew_mb = np.array(rew_mb)
    dones_mb = np.array(dones_mb)
    obs_mb = np.array(obs_mb)
    print("Rewards: ", rew_mb)
    print(rew_mb.shape)
    print(dones_mb)
    print(dones_mb.shape)
    print(obs_mb.shape)
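
Example #2 depends on a make_env_3 factory that is not shown, and the print of env.action_space.n implies the raw gym-retro controller (MultiBinary) has been wrapped into a Discrete action set. The stand-in below is hypothetical: the button combos are illustrative, and the real factory also appears to grayscale and stack four frames, judging by the commented cv2 loop above.

import gym
import numpy as np
import retro


class SonicDiscretizer(gym.ActionWrapper):
    # Illustrative action wrapper: maps a small Discrete action set onto the
    # Genesis controller's 12 buttons, so env.action_space.n exists as the
    # snippet above expects.
    BUTTONS = ['B', 'A', 'MODE', 'START', 'UP', 'DOWN', 'LEFT', 'RIGHT',
               'C', 'Y', 'X', 'Z']
    COMBOS = [[], ['LEFT'], ['RIGHT'], ['RIGHT', 'DOWN'], ['DOWN'],
              ['DOWN', 'B'], ['B']]

    def __init__(self, env):
        super().__init__(env)
        self._actions = []
        for combo in self.COMBOS:
            buttons = np.zeros(len(self.BUTTONS), dtype=np.int8)
            for name in combo:
                buttons[self.BUTTONS.index(name)] = 1
            self._actions.append(buttons)
        self.action_space = gym.spaces.Discrete(len(self._actions))

    def action(self, act):
        return self._actions[act].copy()


def make_env_3():
    # Hypothetical stand-in for the factory used above; observation
    # preprocessing (grayscale, frame stacking) is omitted here.
    return SonicDiscretizer(retro.make(game='SonicTheHedgehog-Genesis'))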
Example #3
def main(hParams, n_run, total_timesteps):
    nsteps = hParams['N_STEPS']
    n_epochs = hParams['N_EPOCHS']
    n_train = 4
    n_minibatch = 8

    log_loss_int = 1
    save_int = 5
    test_int = 10
    test_episodes = 5

    gamma = 0.95
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(ent_coef)
    testenvfn = SonicEnv.make_env_3

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/sonic_long_test/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    env = SubprocVecEnv([SonicEnv.make_env_3])

    nenv = env.num_envs
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = PGNetwork(state_size,
                      num_actions,
                      lr=lr,
                      vf_coef=vf_coef,
                      ent_coef=ent_coef)

    #  Runner used to create training data
    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    # total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps

    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):

        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run()

        for _ in range(n_train):
            indices = np.arange(nbatch)

            np.random.shuffle(indices)

            for start in range(0, nbatch, nbatch // n_minibatch):
                end = start + nbatch // n_minibatch
                bind = indices[start:end]
                policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
                    states_mb[bind], actions_mb[bind], rewards_mb[bind],
                    values_mb[bind])

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)

        r2 = 1 - (np.var(rewards_mb - values_mb) / np.var(rewards_mb))

        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)
            tf.summary.scalar("R-squared", r2, step=update)

        if update % log_loss_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)

        if update % save_int == 0:
            pgnet.model.save_weights('sonic_long_test/' + save_dir +
                                     '/my_checkpoint')
            print("Model Saved")

        if update % test_int == 0:
            TestRewardWriter(summ_writer,
                             testenvfn,
                             pgnet,
                             test_episodes,
                             global_step=update)

    with summ_writer.as_default():
        hp.hparams(hParams)

    env.close()
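
Examples #3 and #4 log HP_LEARNING_RATE, HP_VF_COEF, and HP_ENT_COEF through hp.hparams, which points at the tensorboard.plugins.hparams API. A sketch of how those constants and the run loop might be declared follows; the value grids are placeholders, not the author's actual search space, and the N_STEPS / N_EPOCHS values are illustrative.

from tensorboard.plugins.hparams import api as hp

# Hypothetical declarations of the HParam constants referenced above.
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([1e-4, 2.5e-4]))
HP_VF_COEF = hp.HParam('vf_coef', hp.Discrete([0.25, 0.5]))
HP_ENT_COEF = hp.HParam('ent_coef', hp.Discrete([0.01, 0.001]))

n_run = 0
for lr in HP_LEARNING_RATE.domain.values:
    for vf_coef in HP_VF_COEF.domain.values:
        for ent_coef in HP_ENT_COEF.domain.values:
            hParams = {
                'N_STEPS': 128,          # plain string keys, as used above
                'N_EPOCHS': 10,
                HP_LEARNING_RATE: lr,    # HParam keys picked up by hp.hparams()
                HP_VF_COEF: vf_coef,
                HP_ENT_COEF: ent_coef,
            }
            main(hParams, n_run, total_timesteps=1_000_000)
            n_run += 1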
Example #4
def main(hParams, n_run):
    nsteps = hParams['N_STEPS']
    nenv = hParams[HP_N_ENV]
    n_epochs = hParams['N_EPOCHS']
    total_timesteps = int(n_epochs * nsteps * nenv)
    nbatch = nenv * nsteps

    update_int = 1
    save_int = 5
    test_int = 10

    gamma = 0.99
    lr = hParams[HP_LEARNING_RATE]
    vf_coef = hParams[HP_VF_COEF]
    ent_coef = hParams[HP_ENT_COEF]
    save_dir = 'lr' + str(lr) + 'vc' + str(vf_coef) + 'ec' + str(
        ent_coef) + 'env' + str(nenv)

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/cart_hparam_tuning/run-' + str(n_run)
    summ_writer = tf.summary.create_file_writer(log_dir)

    envfn = lambda: gym.make('CartPole-v1')
    env = SubprocVecEnv([envfn] * nenv)
    state_size = env.observation_space.shape
    num_actions = env.action_space.n
    pgnet = SimplePGNet(state_size,
                        num_actions,
                        learning_rate=lr,
                        vf_coef=vf_coef,
                        ent_coef=ent_coef)

    runner = SonicEnvRunner(env, pgnet, nsteps, gamma)

    print("Total updates to run: ", total_timesteps // nbatch)
    for update in range(1, total_timesteps // nbatch + 1):

        print("\nUpdate #{}:".format(update))
        states_mb, actions_mb, values_mb, rewards_mb, next_dones_mb = runner.run()

        tf.summary.trace_on(graph=True)
        policy_loss, entropy_loss, vf_loss, loss = pgnet.fit_gradient(
            states_mb, actions_mb, rewards_mb, values_mb)
        if update == 1:
            with summ_writer.as_default():
                tf.summary.trace_export(name="grad_trace", step=0)

        WeightWriter(summ_writer, pgnet, (Conv2D, Dense), global_step=update)

        with summ_writer.as_default():
            tf.summary.scalar("PolicyLoss", policy_loss, step=update)
            tf.summary.scalar("EntropyLoss", entropy_loss, step=update)
            tf.summary.scalar("ValueFunctionLoss", vf_loss, step=update)
            tf.summary.scalar("Loss", loss, step=update)

        if update % update_int == 0:
            print("PolicyLoss:", policy_loss)
            print("EntropyLoss: ", entropy_loss)
            print("ValueFunctionLoss: ", vf_loss)
            print("Loss: ", loss)

        if update % save_int == 0:
            pgnet.model.save_weights('cart_hparams_tuning_models/' + save_dir +
                                     '/my_checkpoint')
            print("Model Saved")

        if update % test_int == 0:
            test_rewards = TestRewardWriter(summ_writer,
                                            envfn,
                                            pgnet,
                                            20,
                                            global_step=update)
            print("Test Rewards: ", test_rewards)

    with summ_writer.as_default():
        hp.hparams(hParams)

    env.close()
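
Both example #3 and example #4 rely on SonicEnvRunner.run() returning flattened minibatches in the order states, actions, values, returns (used as the rewards_mb target), and next-dones. Below is a minimal sketch of such a runner, assuming a model.act(obs) method that returns per-environment actions and value estimates; the real PGNetwork / SimplePGNet interface is not shown in these examples.

import numpy as np


class SonicEnvRunner:
    # Illustrative rollout collector matching the run() signature used above.

    def __init__(self, env, model, nsteps, gamma):
        self.env = env
        self.model = model
        self.nsteps = nsteps
        self.gamma = gamma
        self.obs = env.reset()

    def run(self):
        mb_states, mb_actions, mb_values = [], [], []
        mb_rewards, mb_next_dones = [], []
        for _ in range(self.nsteps):
            # Assumed model API: per-environment actions and value estimates.
            actions, values = self.model.act(self.obs)
            mb_states.append(self.obs)
            mb_actions.append(actions)
            mb_values.append(values)
            self.obs, rewards, dones, _ = self.env.step(actions)
            mb_rewards.append(rewards)
            mb_next_dones.append(dones)

        # Bootstrap from the critic's value of the final observation, then
        # accumulate discounted returns backwards, cutting the bootstrap
        # wherever an episode ended.
        _, last_values = self.model.act(self.obs)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)        # (nsteps, nenv)
        mb_next_dones = np.asarray(mb_next_dones, dtype=np.float32)  # (nsteps, nenv)
        returns = np.zeros_like(mb_rewards)
        running = np.asarray(last_values, dtype=np.float32)
        for t in reversed(range(self.nsteps)):
            running = mb_rewards[t] + self.gamma * running * (1.0 - mb_next_dones[t])
            returns[t] = running

        # Flatten (nsteps, nenv, ...) -> (nsteps * nenv, ...) so callers can
        # shuffle a single index array over the whole batch, as above.
        def flatten(x):
            x = np.asarray(x)
            return x.reshape((-1,) + x.shape[2:])

        return (flatten(mb_states), flatten(mb_actions), flatten(mb_values),
                flatten(returns), flatten(mb_next_dones))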