Code example #1
File: main.py  Project: Akhilez/ml_api
async def index(algo: str):
    # Create a fresh random GridWorld and return its layout together with
    # the chosen model's predictions for the initial state.
    env = GridWorldEnv(grid_size, mode="random")
    env.reset()
    player, win, pit, wall = GridWorldBase.get_item_positions(env.state)
    model = models[algo]
    predictions = model.predict(env)
    return {
        "state": {"player": player, "wall": wall, "win": win, "pit": pit},
        "grid_size": grid_size,
        "predictions": predictions,
    }
Code example #2
File: gw_alpha.py  Project: Akhilez/reward_lab
def rollout(state: np.ndarray,
            attempt: int = 0,
            max_steps: int = None,
            **env_kwargs) -> float:
    """Random playout from `state`: returns 1 for a win, 0 otherwise.

    `max_steps` must be supplied (the default of None would break the
    comparison below); `env_kwargs` are forwarded to GridWorldEnv.
    """
    if attempt >= max_steps:
        return 0
    if GridWorldEnv.is_done(state):
        if GridWorldEnv.has_won(state):
            return 1
        return 0
    env_ = GridWorldEnv(**env_kwargs)
    env_.reset()
    env_.set_state(state)
    state, _, _, _ = env_.step(np.random.choice(range(4)))
    return rollout(state, attempt + 1, max_steps, **env_kwargs)
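A hedged usage note: rollout above returns a single 0/1 outcome from one random playout, so in practice its result would typically be averaged over several calls. The sketch below is illustrative only; the helper name estimate_value, the sample count, and the size/mode/max_steps values are assumptions rather than something taken from gw_alpha.py (the GridWorldEnv import path is copied from code example #11).

import numpy as np
from gym_grid_world.envs import GridWorldEnv


def estimate_value(state: np.ndarray, n_rollouts: int = 20) -> float:
    # Average several random playouts to reduce the variance of the 0/1 outcome.
    # The grid size, mode and max_steps below are illustrative values only.
    returns = [rollout(state, max_steps=50, size=4, mode="random")
               for _ in range(n_rollouts)]
    return float(np.mean(returns))


env = GridWorldEnv(size=4, mode="random")
env.reset()
print(estimate_value(env.state))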
Code example #3
File: gw_pg.py  Project: Akhilez/reward_lab
def play(model, cfg):
    # Roll out one greedy episode with the trained policy, rendering each step.
    env = GridWorldEnv(cfg.grid_size, cfg.env_mode)
    env.reset()
    env.render()
    step = 0

    while not env.done and step < cfg.max_steps:
        x = GWPgModel.convert_inputs([env])
        yh = model(x)
        yh = F.softmax(yh, 1)
        action = yh[0].argmax(0)

        _, reward, done, _ = env.step(action)

        env.render()
        step += 1
Code example #4
File: gw_mcts.py  Project: Akhilez/reward_lab
def rollout(state: np.ndarray, attempt: int = 0) -> float:
    if attempt >= MAX_STEPS:
        return 0
    if env.is_done(state):
        if env.has_won(state):
            return 1
        return 0
    env_ = GridWorldEnv(size, mode)
    env_.reset()
    env_.set_state(state)
    state, _, _, _ = env_.step(np.random.choice(range(4)))
    return rollout(state, attempt + 1)
Code example #5
File: gw_mcts.py  Project: Akhilez/reward_lab
    def __init__(self, parent=None, action: int = None):
        self.parent = parent
        self.children = None
        self.action = action
        self.n = 0
        self.wins = 0
        self.is_terminal = False

        temp_env = GridWorldEnv(size, mode)
        temp_env.reset()

        self.state: np.ndarray = (copy.deepcopy(parent.state)
                                  if parent is not None else temp_env.state)

        if action is not None:
            temp_env.set_state(self.state)
            self.state, _, self.is_terminal, _ = temp_env.step(action)
Code example #6
File: main.py  Project: Akhilez/ml_api
async def step(algo: str, data: StepData):
    # Rebuild the environment from the client-supplied positions, apply the
    # requested action, and return the new state with the model's predictions.
    env = GridWorldEnv(grid_size, mode="random")
    env.reset()
    env.state = dict(data.positions)
    state, reward, done, info = env.step(data.action)
    model = models[algo]
    predictions = model.predict(env)
    player, win, pit, wall = GridWorldBase.get_item_positions(state)
    return {
        "reward": reward,
        "done": done,
        "info": info,
        "predictions": predictions,
        "state": {"player": player, "wall": wall, "win": win, "pit": pit},
    }
Code example #7
File: gw_pg.py  Project: Akhilez/reward_lab
    def __init__(self, **config):
        self.cfg = OmegaConf.create(config)

        self.model = (GWPgModel(
            self.cfg.grid_size,
            [self.cfg.units
             for _ in range(self.cfg.depth)]).double().to(device))
        self.optim = torch.optim.Adam(self.model.parameters(), lr=self.cfg.lr)
        self.writer = SummaryWriter(
            f"{CWD}/runs/gw_policy_grad_LR{str(self.cfg.lr)[:7]}_{self.cfg.depth}x{self.cfg.units}_{int(datetime.now().timestamp())}"
        )
        self.envs = [
            GridWorldEnv(size=self.cfg.grid_size, mode=self.cfg.env_mode)
            for _ in range(self.cfg.n_env)
        ]
        self.stats_e = []
        self.won = []
        self.current_episode = 1

        self.reset_episode()
        self.writer.add_graph(self.model, GWPgModel.convert_inputs(self.envs))
Code example #8
File: train_gridworld.py  Project: Akhilez/reward_lab
    def __init__(self):
        super().__init__(GridWorldEnv(size=4, mode="random"))
Code example #9
File: gw_alpha.py  Project: Akhilez/reward_lab
def expand(node: MctsNode) -> MctsNode:
    # Terminal nodes are returned as-is; otherwise create one child per action
    # (4 moves) on the first visit and return a random child.
    if GridWorldEnv.is_done(node.state):
        return node
    if node.children is None:
        node.children = [MctsNode(node, action) for action in range(4)]
    return np.random.choice(node.children)
Code example #10
File: gw_alpha.py  Project: Akhilez/reward_lab
def main():

    # ----------------- Hyper params -------------------

    # Env params
    GRID_SIZE = 4
    ENV_MODE = "random"

    # TRAINING_PARAMS
    EPOCHS = 10000
    BATCH_SIZE = 1
    MAX_MONTE_CARLO_STEPS = 50
    N_TRAIN_STEP = 4
    ARCHITECTURE = [50, 50]
    GAMMA_RETURNS = 0.75
    GAMMA_CREDITS = 0.75
    LEARNING_RATE = 1e-3

    # -------------- Setup other variables ----------------

    model = GwAcModel(GRID_SIZE, ARCHITECTURE).double().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    envs = [
        GridWorldEnv(size=GRID_SIZE, mode=ENV_MODE) for _ in range(BATCH_SIZE)
    ]
    global_step = 0
    timestamp = int(datetime.now().timestamp())
    writer = SummaryWriter(
        f"{CWD}/runs/gw_ac_LR{str(LEARNING_RATE)[:7]}_{timestamp}")

    # Add model graph
    envs[0].reset()
    writer.add_graph(model, GwAcModel.convert_inputs(envs[:1]))

    # -------------- Training loop ----------------

    for epoch in range(EPOCHS):

        [env.reset() for env in envs]

        stats: List[List[Dict[str, Union[torch.Tensor,
                                         float]]]] = [[] for _ in envs]
        step = 0

        episode_rewards = []

        tree = MctsNode()
        expand(tree)

        # -------------- Monte Carlo Loop ---------------------
        while True:
            """

            1. predict for node when you create the node
            2. During expansion, filter the children
            3. During selection, only go deep enough

            """

            # ----------- Predict policy and value -------------
            states = GwAcModel.convert_inputs(envs)
            policy, value = model(
                states)  # Shapes: policy: (batch, 4); value: (batch, 1)

            # ------------ Sample actions -----------------
            tau = max((1 / (np.log(epoch) + 0.0001) * 5), 0.7)
            writer.add_scalar("tau", tau, global_step=global_step)
            policy = F.gumbel_softmax(policy, tau=tau, dim=1)
            actions = torch.multinomial(policy, 1).squeeze()  # shape: (batch)

            # ------------- Rewards from step ----------------
            for i in range(BATCH_SIZE):
                if not envs[i].done:
                    _, reward, _, _ = envs[i].step(actions[i])
                    stats[i].append({
                        "reward": reward,
                        "value": value[i][0],
                        "policy": policy[i][actions[i]],
                    })
                    episode_rewards.append(reward)

            # -------------- Termination conditions ------------------

            all_done = all([env.done for env in envs])
            has_timed_out = step >= MAX_MONTE_CARLO_STEPS
            n_step_ended = step % N_TRAIN_STEP == 0

            if has_timed_out:
                # Set unfinished env's reward to -10
                for i in range(BATCH_SIZE):
                    if not envs[i].done:
                        stats[i][-1]["reward"] = -10
                all_done = True

            if all_done or n_step_ended:
                # ----------- Add last state's value -------------
                states = GwAcModel.convert_inputs(envs)

                model.eval()
                with torch.no_grad():
                    _, value = model(states)
                model.train()

                for i in range(BATCH_SIZE):
                    stats[i].append({"value": value[i][0]})

                # -------------- LEARN -----------------
                # loss = naive_ac_loss(stats, GAMMA_RETURNS, GAMMA_CREDITS)
                loss = advantage_ac_loss(stats, GAMMA_RETURNS)

                optim.zero_grad()
                loss.backward()
                optim.step()

                # ------------ Logging ----------------
                writer.add_scalar("Training loss",
                                  loss.item(),
                                  global_step=global_step)
                global_step += 1

                # Clean up
                stats = [[] for _ in envs]

            if all_done:
                break
            writer.add_scalar("Mean Rewards",
                              np.mean(episode_rewards),
                              global_step=global_step)
            step += 1

        print(".", end="")

    save_model(model, CWD, "grid_world_ac")
Code example #11
File: gw_mcts.py  Project: Akhilez/reward_lab
import numpy as np
import copy
import torch

from gym_grid_world.envs import GridWorldEnv

size = 4
mode = "static"
MAX_STEPS = 50

env = GridWorldEnv(size=size, mode=mode)


class MctsNode:
    def __init__(self, parent=None, action: int = None):
        self.parent = parent
        self.children = None
        self.action = action
        self.n = 0
        self.wins = 0
        self.is_terminal = False

        temp_env = GridWorldEnv(size, mode)
        temp_env.reset()

        self.state: np.ndarray = (copy.deepcopy(parent.state)
                                  if parent is not None else temp_env.state)

        if action is not None:
            temp_env.set_state(self.state)
            self.state, _, self.is_terminal, _ = temp_env.step(action)
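As a hedged illustration of how MctsNode and the rollout function from code example #4 might be wired into a single MCTS iteration: the sketch below uses only attributes shown above (children, n, wins, parent, is_terminal, state); the UCB1 selection rule, the exploration constant, and the names select/backup/mcts_iteration are my assumptions, not something taken from gw_mcts.py.

import math


def select(node: MctsNode) -> MctsNode:
    # Descend with UCB1 until a leaf or terminal node is reached
    # (the exploration constant 1.4 is an arbitrary, illustrative choice).
    while node.children is not None and not node.is_terminal:
        node = max(
            node.children,
            key=lambda c: float("inf") if c.n == 0
            else c.wins / c.n + 1.4 * math.sqrt(math.log(node.n + 1) / c.n),
        )
    return node


def backup(node: MctsNode, outcome: float) -> None:
    # Propagate the rollout result back up to the root.
    while node is not None:
        node.n += 1
        node.wins += outcome
        node = node.parent


def mcts_iteration(root: MctsNode) -> None:
    leaf = select(root)
    if leaf.children is None and not leaf.is_terminal:
        leaf.children = [MctsNode(leaf, action) for action in range(4)]
        leaf = np.random.choice(leaf.children)
    backup(leaf, rollout(leaf.state))

A full search would repeat mcts_iteration many times from the current root and then act on the child with the highest visit count; the project's own loop may well differ from this sketch.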
Code example #12
File: per_q.py  Project: Akhilez/reward_lab
def main_single_batch():

    # ============= INITIALIZE VARIABLES ===================

    grid_size = 4
    epsilon = 0.1
    gamma = 0.7
    n_episodes = 5000
    max_steps = 50
    replay_batch_size = 199
    max_buffer_size = 1000
    lr = 0.01
    mode = "random"
    architecture = [50]

    writer = SummaryWriter(
        f"{CWD}/runs/gw_PER_q_LR{str(lr)[:7]}_{mode}_{int(datetime.now().timestamp())}"
    )
    env = GridWorldEnv(size=grid_size, mode=mode)
    experiences = PrioritizedReplay(max_size=max_buffer_size)

    model = GWPgModel(size=grid_size, units=architecture).double().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # ============== TRAINING LOOP ===========================

    for epoch in range(n_episodes):
        env.reset()
        step = 0
        losses = []
        all_rewards = []

        while not env.done and step < max_steps:
            # Store state for experience replay
            state = env.state

            # =============== Collect experiences =================

            envs = [env]

            exp_samples = experiences.sample(replay_batch_size)
            for exp in exp_samples:
                new_env = GridWorldEnv(size=grid_size, mode=mode)
                new_env.reset()
                new_env.state = state_to_dict(exp[1])
                envs.append(new_env)

            x = model.convert_inputs(envs)
            x = x + torch.rand_like(x) / 100

            # =======================================================

            y = model(x)

            rewards = []
            qhs = []
            for i in range(len(envs)):
                # =========== Epsilon Probability ==============

                use_rand = torch.rand(1)[0] < epsilon
                action = (
                    torch.randint(0, 4, (1,))[0] if use_rand else torch.argmax(y[i], 0)
                )
                qh = y[i][action]

                # ============ Observe the reward and predict the value of the next state ==============

                _, reward, _, _ = envs[i].step(int(action))
                rewards.append(reward)
                qhs.append(qh)
            rewards = torch.tensor(rewards).double().to(device)
            qhs = torch.stack(qhs)

            with torch.no_grad():
                model.eval()
                x_next = model.convert_inputs(envs)
                y_next = model(x_next)
                q_next, _ = torch.max(y_next, dim=1)
                model.train()

            q = rewards + gamma * q_next

            # =========== LEARN ===============

            loss = (qhs - q) ** 2

            experiences.put([(loss[0].item(), state)])
            losses.append(loss[0].item())
            all_rewards.append(rewards[0].item())  # store a plain float so np.mean below works

            loss = torch.mean(loss)
            optim.zero_grad()
            loss.backward()
            optim.step()

            step += 1

        writer.add_scalar("loss", np.mean(losses), global_step=epoch)
        writer.add_scalar("reward", np.mean(all_rewards), global_step=epoch)
        writer.add_scalar("episode_len", len(losses), global_step=epoch)
        print(".", end="")
Code example #13
def main_single_batch():

    # ============= INITIALIZE VARIABLES ===================

    grid_size = 4
    epsilon = 0.1
    gamma = 0.95
    n_episodes = 15000
    max_steps = 50
    replay_batch_size = 199
    max_buffer_size = 1000
    sync_freq = 250  # NEW
    lr = 0.001
    mode = "random"
    architecture = [50]

    writer = SummaryWriter(
        f"{CWD}/runs/gw_target_network_q_LR{str(lr)[:7]}_{mode}_{int(datetime.now().timestamp())}"
    )
    env = GridWorldEnv(size=grid_size, mode=mode)
    experiences = PrioritizedReplay(max_size=max_buffer_size)

    model = GWPgModel(size=grid_size, units=architecture).double().to(device)

    model2 = deepcopy(model)
    model2.load_state_dict(model.state_dict())
    model2.eval()

    optim = torch.optim.Adam(model.parameters(), lr=lr)

    global_step = 0

    # ============== TRAINING LOOP ===========================

    for epoch in range(n_episodes):
        env.reset()
        step = 0

        while not env.done and step <= max_steps:
            # Store state for experience replay
            state = env.state

            # =============== Collect experiences =================

            envs = [env]

            exp_samples = experiences.sample(replay_batch_size)
            for exp in exp_samples:
                new_env = GridWorldEnv(size=grid_size, mode=mode)
                new_env.reset()
                new_env.state = state_to_dict(exp[1])
                envs.append(new_env)

            x = model.convert_inputs(envs)
            x = x + torch.rand_like(x) / 100

            # =======================================================

            y = model(x)

            rewards = []
            qhs = []
            for i in range(len(envs)):
                # =========== Epsilon Probability ==============

                use_rand = torch.rand(1)[0] < epsilon
                action = (
                    torch.randint(0, 4, (1,))[0] if use_rand else torch.argmax(y[i], 0)
                )
                qh = y[i][action]

                # ============ Observe the reward and predict the value of the next state ==============

                _, reward, _, _ = envs[i].step(int(action))
                rewards.append(reward)
                qhs.append(qh)

            if step >= max_steps:
                rewards[0] = -10
                env.done = True
            rewards = torch.tensor(rewards).double().to(device)
            qhs = torch.stack(qhs)

            x_next = model2.convert_inputs(envs)
            with torch.no_grad():
                y_next = model2(x_next)
            q_next, _ = torch.max(y_next, dim=1)

            q = rewards + gamma * q_next

            # =========== LEARN ===============

            loss = (qhs - q) ** 2

            loss = torch.mean(loss)
            optim.zero_grad()
            loss.backward()
            optim.step()

            step += 1
            global_step += 1

            if global_step % sync_freq == 0:
                model2.load_state_dict(model.state_dict())

            writer.add_scalar("loss", loss.item(), global_step=global_step)
            writer.add_scalar("reward", rewards[0].item(), global_step=global_step)
        # writer.add_scalar("episode_len", step, global_step=global_step)
        print(".", end="")

    save_model(model, CWD + "/..", "grid_world_q")
Code example #14
File: vanilla_pg.py  Project: Akhilez/reward_lab
gamma_returns = 0.80
gamma_credits = 0.95

total_episodes = 1000
n_env = 50
max_steps = 100

grid_size = 4
env_mode = "random"

# -----------------------------------------------------

# lr, units and depth are defined earlier in vanilla_pg.py (not shown in this excerpt)
model = GwAcModel(grid_size, [units for _ in range(depth)]).double().to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)

envs = [GridWorldEnv(size=grid_size, mode=env_mode) for _ in range(n_env)]

current_episode = 1
stats_e = None
won = None

writer = SummaryWriter(
    f"{CWD}/runs/gw_policy_grad_LR{str(lr)[:7]}_{depth}x{units}_{int(datetime.now().timestamp())}"
)

envs[0].reset()
writer.add_graph(model, GwAcModel.convert_inputs(envs[:1]))

# -----------------------------------------------------

Code example #15
File: vanilla_q.py  Project: Akhilez/reward_lab
def main_single_batch():

    # ============= INITIALIZE VARIABLES ===================

    grid_size = 4
    epsilon = 0.1
    gamma = 0.9

    n_episodes = 10000
    max_steps = 150

    lr = 0.01

    mode = "player"

    writer = SummaryWriter(
        f"{CWD}/runs/gw_vanilla_q_LR{str(lr)[:7]}_{mode}_{int(datetime.now().timestamp())}"
    )

    env = GridWorldEnv(size=grid_size, mode=mode)

    model = GWPgModel(size=grid_size, units=[10]).double().to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    # ============== TRAINING LOOP ===========================

    for epoch in range(n_episodes):
        env.reset()
        step = 0
        losses = []
        rewards = []

        # Monte Carlo loop
        while not env.done and step < max_steps:
            x = model.convert_inputs([env])
            y = model(x)

            # =========== Epsilon Probability ==============

            if torch.rand(1) < epsilon:
                action = torch.randint(0, 4, (1,))
            else:
                action = torch.argmax(y, 1)
            qh = y[0][action]

            # ============ Observe the reward and predict the value of the next state ==============

            _, reward, _, _ = env.step(int(action))

            with torch.no_grad():
                model.eval()
                q_next, _ = torch.max(model(model.convert_inputs([env]))[0],
                                      dim=0)
                model.train()

            q = reward + gamma * q_next

            # =========== LEARN ===============

            loss = (qh - q)**2

            losses.append(loss.item())
            rewards.append(reward)

            optim.zero_grad()
            loss.backward()
            optim.step()

            step += 1

        writer.add_scalar("loss", np.mean(losses), global_step=epoch)
        writer.add_scalar("reward", np.mean(rewards), global_step=epoch)
        writer.add_scalar("episode_len", len(losses), global_step=epoch)
        print(".", end="")