Example #1
def main():

    # ENVIRONMENT
    env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 200
    batch_size = 32
    max_iterations = 200
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)
    
    # INIT
    history = History()
    observation = env.reset()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD 
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']
    

    for ite in tqdm(range(max_iterations), ascii=True):

        if ite % 5 == 0:
            torch.save({
                'policy_model': policy_model.state_dict(),
                'policy_optimizer': policy_optimizer.state_dict(),
                'value_model': value_model.state_dict(),
                'value_optimizer': value_optimizer.state_dict(),
                'running_reward': running_reward}, model_path)


        
        episode_ite, running_reward = collect(episode_ite, running_reward, env,
                                            max_episodes, max_timesteps, state_scale,
                                            reward_scale, writer, history, policy_model,
                                            value_model, gamma, gae_lambda, device)
        
        # Here we have collected N trajectories.
        history.build_dataset()

        data_loader = DataLoader(history, batch_size=batch_size, shuffle=True, drop_last=True)


        policy_loss, value_loss, train_ite = train_network(data_loader, policy_model, value_model,
                                                        policy_optimizer, value_optimizer, n_epoch, clip,
                                                        train_ite, writer, entropy_coefficient)


        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        writer.add_scalar("Running Reward", running_reward, epoch_ite)


        if (running_reward > env.spec.reward_threshold):
            print("\nSolved!")
            break
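
Example #1 relies on ValueNetwork, PolicyNetwork, History, collect, and train_network being defined elsewhere in the project. For reference, below is a minimal sketch of what the two networks might look like, assuming plain two-hidden-layer MLPs matching the constructor calls above (ValueNetwork(in_dim=feature_dim) and PolicyNetwork(in_dim=feature_dim, n=n_actions)); the hidden width, activation, and softmax output are assumptions, and the project's real implementations may differ.

import torch
import torch.nn as nn


class ValueNetwork(nn.Module):
    """State-value estimator V(s); sketch matching ValueNetwork(in_dim=feature_dim)."""

    def __init__(self, in_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x):
        # Squeeze the trailing dimension so a batch of states yields a batch of scalars.
        return self.net(x).squeeze(-1)


class PolicyNetwork(nn.Module):
    """Categorical policy over n discrete actions; sketch matching PolicyNetwork(in_dim=..., n=...)."""

    def __init__(self, in_dim, n, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, hidden_dim), nn.Tanh(),
            nn.Linear(hidden_dim, n),
        )

    def forward(self, x):
        # Action probabilities; the project's collect/train code may expect raw logits instead.
        return torch.softmax(self.net(x), dim=-1)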
Example #2
def main():

    # ENVIRONMENT
    # env_name = "CartPole-v1"
    # env_name = "LunarLander-v2"
    # env_name = "Acrobot-v1"
    env_name = "MountainCar-v0"
    env = gym.make(env_name)
    n_actions = env.action_space.n
    feature_dim = env.observation_space.shape[0]

    # PARAMETERS
    learning_rate = 1e-3
    state_scale = 1.0
    reward_scale = 1.0
    clip = 0.2
    n_epoch = 4
    max_episodes = 10
    max_timesteps = 100
    batch_size = 32
    max_iterations = 1000
    gamma = 0.99
    gae_lambda = 0.95
    entropy_coefficient = 0.01
    env_threshold = env.spec.reward_threshold

    # NETWORK
    value_model = ValueNetwork(in_dim=feature_dim).to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork(in_dim=feature_dim, n=n_actions).to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()
    observation = env.reset()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = -500

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    EnvQueue = queue.SimpleQueue()

    for _ in range(max_episodes):
        env = gym.make(env_name)
        observation = env.reset()
        EnvQueue.put((env, observation, 0))

    for ite in tqdm(range(max_iterations), ascii=True):

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        q = queue.SimpleQueue()

        env_list = []
        while not EnvQueue.empty():
            env_list.append(EnvQueue.get())

        threads = []
        for env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, env, EnvQueue, max_timesteps,
                                     state_scale, reward_scale, policy_model,
                                     value_model, gamma, gae_lambda, device
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
                writer.add_scalar("Running Reward", running_reward,
                                  episode_ite)
                writer.add_scalar("Episode Reward", ep_reward, episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare dataset
        history.build_dataset()

        data_loader = DataLoader(history,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True)

        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        if (running_reward > env_threshold):
            print("\nSolved!")
            break
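
Example #2 hands gamma and gae_lambda to the threaded collect workers, which indicates that the advantages stored in History are computed with Generalized Advantage Estimation (GAE). The helper below is a sketch of that computation under assumed inputs (per-step rewards, value estimates, done flags, and a bootstrap value for the last state); the real collect implementation is not shown above and may differ.

def compute_gae(rewards, values, last_value, dones, gamma=0.99, gae_lambda=0.95):
    """Generalized Advantage Estimation over a single trajectory (plain Python lists)."""
    advantages = []
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        # TD residual: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        gae = delta + gamma * gae_lambda * non_terminal * gae
        advantages.insert(0, gae)
        next_value = values[t]
    # Returns (targets for the value network) are advantages plus the value baseline.
    returns = [adv + v for adv, v in zip(advantages, values)]
    return advantages, returns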
Example #3
def main(env_name, lr, state_scale, reward_scale, clip, train_epoch,
         max_episodes, max_timesteps, batch_size, max_iterations, gamma,
         gae_lambda, entropy_coefficient, start_running_reward, update_rate):

    # ENVIRONMENT
    env = ChessEnv()

    # PARAMETERS
    learning_rate = lr
    n_epoch = train_epoch

    # NETWORK
    value_model = ValueNetwork().to(device)
    value_optimizer = optim.Adam(value_model.parameters(), lr=learning_rate)

    policy_model = PolicyNetwork().to(device)
    policy_optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

    # INIT
    history = History()

    epoch_ite = 0
    episode_ite = 0
    train_ite = 0
    running_reward = start_running_reward

    # TENSORBOARD
    timestr = time.strftime("%d%m%Y-%H%M%S-")

    log_dir = "./runs/" + timestr + env_name + "-BS" + str(batch_size) + "-E" + \
            str(max_episodes) + "-MT" + str(max_timesteps) + "-NE" + str(n_epoch) + \
            "-LR" + str(learning_rate) + "-G" + str(gamma) + "-L" + str(gae_lambda)

    writer = SummaryWriter(log_dir=log_dir)

    # LOAD MODEL
    # Create folder models
    if not Path("./models").exists():
        print("Creating Models folder")
        Path("./models").mkdir()

    model_path = Path("./models/" + env_name + ".tar")
    if model_path.exists():
        print("Loading model!")
        #Load model
        checkpoint = torch.load(model_path)
        policy_model.load_state_dict(checkpoint['policy_model'])
        policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        value_model.load_state_dict(checkpoint['value_model'])
        value_optimizer.load_state_dict(checkpoint['value_optimizer'])
        running_reward = checkpoint['running_reward']

    # Create SavedEnvs queue
    SavedEnv = queue.SimpleQueue()
    for _ in range(max_episodes):
        env = ChessEnv()
        SavedEnv.put((env, env.reset(), 0))

    # START ITERATING
    for ite in tqdm(range(max_iterations), ascii=True):
        # Refresh the rival policy from the current policy every update_rate iterations
        if ite % update_rate == 0:
            print("\nUpdating")
            rival_policy = PolicyNetwork().to(device)
            rival_policy.load_state_dict(policy_model.state_dict())

        if ite % 5 == 0:
            torch.save(
                {
                    'policy_model': policy_model.state_dict(),
                    'policy_optimizer': policy_optimizer.state_dict(),
                    'value_model': value_model.state_dict(),
                    'value_optimizer': value_optimizer.state_dict(),
                    'running_reward': running_reward
                }, model_path)

        print("\nSimulating")
        start_simulation = time.perf_counter()

        q = queue.SimpleQueue()

        env_list = []
        while not SavedEnv.empty():
            env_list.append(SavedEnv.get())

        threads = []
        for saved_env in env_list:
            t = threading.Thread(target=collect,
                                 args=[
                                     q, env_name, saved_env, SavedEnv,
                                     max_timesteps, state_scale, reward_scale,
                                     policy_model, value_model, gamma,
                                     gae_lambda, device, rival_policy
                                 ])
            t.start()
            threads.append(t)

        for t in threads:
            t.join()

        # for saved_env in env_list:
        #     if ite % 20 == 0:
        #         update_policy = True
        #     else:
        #         update_policy = False
        #     collect(q, env_name, saved_env,
        #                         SavedEnv, max_timesteps, state_scale, reward_scale,
        #                         policy_model, value_model, gamma,
        #                         gae_lambda, device, update_policy)

        avg_episode_reward = []
        # Write all episodes from queue to history buffer
        while not q.empty():
            episode, done = q.get()
            history.episodes.append(episode)
            avg_episode_reward.append((episode.reward, done))

        end_simulation = time.perf_counter()
        print(f"Simulation time: {end_simulation-start_simulation:.2f} ")

        for ep_reward, done in avg_episode_reward:
            if done:
                running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

                writer.add_scalar("Average Episode Reward", ep_reward,
                                  episode_ite)
                episode_ite += 1

        # avg_ep_reward = sum(avg_episode_reward) / len(avg_episode_reward)

        # Here we have collected N trajectories and prepare dataset
        history.build_dataset()

        data_loader = DataLoader(history,
                                 batch_size=batch_size,
                                 shuffle=True,
                                 drop_last=True)

        print("Training")
        policy_loss, value_loss, train_ite = train_network(
            data_loader, policy_model, value_model, policy_optimizer,
            value_optimizer, n_epoch, clip, train_ite, writer,
            entropy_coefficient)

        end_training = time.perf_counter()
        print(f"Training time: {end_training-end_simulation:.2f}")

        for p_l, v_l in zip(policy_loss, value_loss):
            epoch_ite += 1
            writer.add_scalar("Policy Loss", p_l, epoch_ite)
            writer.add_scalar("Value Loss", v_l, epoch_ite)

        history.free_memory()

        # print("\n", running_reward)

        writer.add_scalar("Running Reward", running_reward, epoch_ite)

        if (running_reward > 0):
            print("\nSolved!")
            break
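
Example #3's train_network receives clip and entropy_coefficient, i.e. the standard PPO clipped-surrogate update with an entropy bonus. The function below sketches the per-mini-batch losses such an update typically computes; the tensor names and the assumption that policy_model returns action probabilities are illustrative and not taken from the original code.

import torch
import torch.nn.functional as F
from torch.distributions import Categorical


def ppo_losses(policy_model, value_model, states, actions, old_log_probs,
               advantages, returns, clip=0.2, entropy_coefficient=0.01):
    """Clipped-surrogate policy loss and value-regression loss for one mini-batch."""
    dist = Categorical(probs=policy_model(states))
    log_probs = dist.log_prob(actions)
    ratio = torch.exp(log_probs - old_log_probs)

    # Clipped surrogate objective (maximized, hence negated to form a loss).
    surrogate = torch.min(ratio * advantages,
                          torch.clamp(ratio, 1.0 - clip, 1.0 + clip) * advantages)
    policy_loss = -(surrogate.mean() + entropy_coefficient * dist.entropy().mean())

    # The value network regresses the empirical returns.
    value_loss = F.mse_loss(value_model(states), returns)
    return policy_loss, value_loss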
Example #4
# Restore a previous checkpoint if one exists (same layout as the examples above).
if model_path.exists():
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['policy_model'])
    optimizer.load_state_dict(checkpoint['policy_optimizer'])

for epoch in tqdm(range(1, num_epochs + 1)):
    tr_loss = train_epoch(train_loader, model, optimizer, criterion)
    tr_losses.append(tr_loss)
    te_loss, te_acc = test_epoch(test_loader, model)
    te_losses.append(te_loss)
    te_accs.append(te_acc)

    writer.add_scalar("Test loss", te_loss, epoch)
    writer.add_scalar("Test accuracy", te_acc, epoch)

    torch.save(
        {
            'policy_model': model.state_dict(),
            'policy_optimizer': optimizer.state_dict()
        }, model_path)

# plt.figure(figsize=(10, 8))
# plt.subplot(2,1,1)
# plt.xlabel('Epoch')
# plt.ylabel('NLLLoss')
# plt.plot(tr_losses, label='train')
# plt.show()
# plt.plot(te_losses, label='test')
# plt.show()
# plt.legend()
# plt.subplot(2,1,2)
# plt.xlabel('Epoch')
# plt.ylabel('Test Accuracy [%]')
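
Example #4 depends on train_epoch and test_epoch helpers that are not shown, together with a criterion such as nn.NLLLoss (suggested by the commented 'NLLLoss' axis label). The sketch below shows how such helpers are commonly written for a classifier whose model outputs log-probabilities; the (inputs, targets) loader contract is an assumption, not taken from the original code.

import torch
import torch.nn.functional as F


def train_epoch(loader, model, optimizer, criterion):
    """One optimization pass over the training set; returns the mean batch loss."""
    model.train()
    total_loss = 0.0
    for inputs, targets in loader:
        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)


@torch.no_grad()
def test_epoch(loader, model):
    """Evaluation pass; returns mean NLL loss and accuracy in percent."""
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    for inputs, targets in loader:
        log_probs = model(inputs)
        total_loss += F.nll_loss(log_probs, targets).item()
        correct += (log_probs.argmax(dim=1) == targets).sum().item()
        total += targets.size(0)
    return total_loss / len(loader), 100.0 * correct / total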