Example #1
def main(config: str, agent: str):
    # Load the raw JSON config and deserialize it into a GameConfig.
    with open(config) as fp:
        json_data = json.load(fp)

    config = GameConfig.deserialize(json_data)
    # Directory where monitor logs and checkpoints for this agent are written.
    log_dir = config.agents_config[agent]["save_path"]
    if agent == "DQN":
        env = make_atari_env(config.game_name, n_envs=1,
                             seed=0, monitor_dir=log_dir)

    elif agent == "PPO":
        env = make_atari_env(config.game_name, n_envs=8,
                             seed=0, monitor_dir=log_dir)

    else:
        env = make_atari_env(config.game_name, n_envs=16,
                             seed=0, monitor_dir=log_dir)

    # Stack 4 consecutive frames, the standard Atari observation preprocessing.
    env = VecFrameStack(env, n_stack=4)

    agent = AgentLoader.get_agent(agent, config.agents_config, env)

    # Checkpoint the model whenever the mean training reward improves.
    reward_callback = SaveOnBestTrainingRewardCallback(
        check_freq=100, log_dir=log_dir)

    start_time = time.time()
    steps = 10_000_000
    with ProgressBarManager_new(steps) as progress_callback:
        agent.agent.learn(total_timesteps=steps, callback=[
                          reward_callback, progress_callback])
        # agent.save()
        env.close()

    elapsed = time.time() - start_time
    print(f'\nTraining took {elapsed:.2f} seconds')
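The snippet above is shown without its imports. Below is a minimal sketch of what it assumes from stable-baselines3, plus a hypothetical invocation; the config path and agent name are placeholders, and GameConfig, AgentLoader, SaveOnBestTrainingRewardCallback and ProgressBarManager_new are project-specific helpers not shown here.

import json
import time

from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.vec_env import VecFrameStack

# Hypothetical call: train the DQN agent described in a JSON config file.
if __name__ == "__main__":
    main("configs/breakout.json", "DQN")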
Example #2
def train_and_test_ec(config, video_length_=1000, total_timesteps_=10000):
    print(config)
    # Atari environments get the standard wrappers (84x84 grayscale frames);
    # everything else uses a plain vectorized env with its native observation shape.
    if config.atari_wrapper:
        train_env = make_atari_env(config.environment, n_envs=config.workers)
        train_env = VecFrameStack(train_env, n_stack=1)
        shape = (84, 84, 1)
    else:
        train_env = make_vec_env(config.environment, n_envs=config.workers)
        shape = train_env.observation_space.shape

    # Episodic-curiosity components: an R-network plus one episodic memory per worker.
    rnet = RNetwork(shape, config.ensemble_size)
    vec_episodic_memory = [
        EpisodicMemory([64],
                       rnet.embedding_similarity,
                       replacement='random',
                       capacity=200) for _ in range(config.workers)
    ]
    target_image_shape = list(shape)
    # assert type(config.add_stoch) == bool, "Please indicate whether or not you want stochasticity added"
    train_env = CuriosityEnvWrapper(train_env, vec_episodic_memory,
                                    rnet.embed_observation, target_image_shape,
                                    config.add_stoch)
    r_network_trainer = RNetworkTrainer(rnet,
                                        learning_rate=config.rnet_lr,
                                        observation_history_size=2000,
                                        training_interval=1000)
    train_env.add_observer(r_network_trainer)
    tb_dir = os.path.join(config.log_dir, config.tb_subdir)
    model = config.agent(config.policy_model,
                         train_env,
                         config,
                         verbose=config.verbose,
                         tensorboard_log=tb_dir)

    model.learn(total_timesteps=total_timesteps_)

    print("stopped to learn")
    #model.save("models/"+config.experiment)

    # Roll out the trained policy deterministically and render each step.
    obs = train_env.reset()
    for i in range(video_length_ + 1):
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, info = train_env.step(action)
        train_env.render()
        # Vectorized envs return an array of done flags; reset when any env finishes.
        if done.any():
            obs = train_env.reset()

    train_env.close()
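A hedged usage sketch for this function, assuming config is a simple namespace whose fields mirror the attributes read above. All values are placeholders; ConfigAwarePPO is a hypothetical adapter (the function passes the whole config object as the agent constructor's third positional argument, which stock stable-baselines3 classes do not accept), and RNetwork, EpisodicMemory, CuriosityEnvWrapper and RNetworkTrainer still come from the surrounding project.

from types import SimpleNamespace
from stable_baselines3 import PPO

class ConfigAwarePPO(PPO):
    # Hypothetical adapter: accept and ignore the config positional argument.
    def __init__(self, policy, env, config, **kwargs):
        super().__init__(policy, env, **kwargs)

config = SimpleNamespace(
    environment="BreakoutNoFrameskip-v4",  # placeholder Gym id
    atari_wrapper=True,
    workers=4,
    ensemble_size=5,          # assumed R-network ensemble size
    add_stoch=False,
    rnet_lr=1e-4,
    log_dir="logs",
    tb_subdir="tb",
    agent=ConfigAwarePPO,     # hypothetical agent class taking (policy, env, config, ...)
    policy_model="CnnPolicy",
    verbose=1,
)

train_and_test_ec(config, video_length_=500, total_timesteps_=5000)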
Example #3
                real_next_obs[idx] = infos[idx]["terminal_observation"]
        rb.add(obs, real_next_obs, actions, rewards, dones)

        # TRY NOT TO MODIFY: CRUCIAL step easy to overlook
        obs = next_obs

        # ALGO LOGIC: training.
        if global_step > args.learning_starts and global_step % args.train_frequency == 0:
            data = rb.sample(args.batch_size)
            with torch.no_grad():
                # One-step TD target: r + gamma * max_a' Q_target(s', a'),
                # with the bootstrap term masked on terminal transitions.
                target_max, _ = target_network.forward(data.next_observations).max(dim=1)
                td_target = data.rewards.flatten() + args.gamma * target_max * (1 - data.dones.flatten())
            old_val = q_network.forward(data.observations).gather(1, data.actions).squeeze()
            loss = loss_fn(td_target, old_val)

            if global_step % 100 == 0:
                writer.add_scalar("losses/td_loss", loss, global_step)

            # optimize the model
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(list(q_network.parameters()), args.max_grad_norm)
            optimizer.step()

            # update the target network
            if global_step % args.target_network_frequency == 0:
                target_network.load_state_dict(q_network.state_dict())

    envs.close()
    writer.close()
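For context, here is a minimal sketch of the objects this fragment relies on, loosely following the CleanRL DQN layout. The environment id, network sizes and hyperparameters below are placeholders, not the original script's values.

import copy
import gymnasium as gym
import torch
import torch.nn as nn
from stable_baselines3.common.buffers import ReplayBuffer
from torch.utils.tensorboard import SummaryWriter

# Placeholder vectorized environment (the original script builds its own `envs`).
envs = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1")])

# Online Q-network and a target copy that the loop above refreshes periodically.
q_network = nn.Sequential(
    nn.Linear(envs.single_observation_space.shape[0], 120), nn.ReLU(),
    nn.Linear(120, 84), nn.ReLU(),
    nn.Linear(84, envs.single_action_space.n),
)
target_network = copy.deepcopy(q_network)

optimizer = torch.optim.Adam(q_network.parameters(), lr=2.5e-4)
loss_fn = nn.MSELoss()
writer = SummaryWriter("runs/dqn_sketch")

# Replay buffer sampled as `rb` in the training loop.
rb = ReplayBuffer(
    buffer_size=10_000,
    observation_space=envs.single_observation_space,
    action_space=envs.single_action_space,
    device="cpu",
)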