# The examples below assume these imports, plus the project's own
# StochasticMDPEnv, Agent / hDQN, and one_hot definitions (their module
# paths are not shown in this listing):
from collections import namedtuple
import numpy as np
import matplotlib.pyplot as plt


def main():
    ActorExperience = namedtuple("ActorExperience", ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience", ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = Agent()
    for episode in range(100):
        print("\n### EPISODE %d ###" % episode)
        state = env.reset()
        done = False
        while not done:
            goal = agent.select_goal(one_hot(state))
            goal_state = state  # remember the state in which the meta-controller chose this goal
            print("New Goal: %d" % goal)
            total_external_reward = 0
            goal_reached = False
            while not done and not goal_reached:
                print(state, end=",")
                action = agent.select_move(one_hot(state), one_hot(goal))
                next_state, external_reward, done = env.step(action)
                intrinsic_reward = agent.criticize(one_hot(state), one_hot(goal), action, one_hot(next_state))
                goal_reached = next_state == goal
                if goal_reached:
                    print("Success!!")
                exp = ActorExperience(one_hot(state), one_hot(goal), action, intrinsic_reward, one_hot(next_state))
                agent.store(exp, meta=False)
                agent.update(meta=False)
                agent.update(meta=True)
                total_external_reward += external_reward
                state = next_state
            exp = MetaExperience(one_hot(goal_state), one_hot(goal), total_external_reward, one_hot(next_state))
            agent.store(exp, meta=True)
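These scripts rely on a one_hot helper to encode the integer states 1..6 (and goals) as vectors. Its definition is not part of this listing; a minimal sketch, assuming six states indexed from 1 and a leading batch dimension:

def one_hot(state, n_states=6):
    # Hypothetical helper: map state s in {1, ..., n_states} to a (1, n_states) one-hot row vector.
    vector = np.zeros(n_states)
    vector[state - 1] = 1.0
    return np.expand_dims(vector, axis=0)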
def run_architecture(meta_layers, meta_inits, meta_nodes, meta_activations,
            meta_loss, meta_optimizer, layers, inits, nodes, activations, loss,
            optimizer, n_samples, meta_n_samples, gamma, meta_epsilon, k_episodes=12):
    ActorExperience = namedtuple("ActorExperience",
                        ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                        ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = hDQN(meta_layers=meta_layers, meta_inits=meta_inits,
                meta_nodes=meta_nodes, meta_activations=meta_activations,
                meta_loss=meta_loss, meta_optimizer=meta_optimizer,
                layers=layers, inits=inits, nodes=nodes, activations=activations,
                meta_n_samples=meta_n_samples, gamma=gamma, meta_epsilon=meta_epsilon)
    #agent = hDQN()
    visits = np.zeros((k_episodes, 6))
    cumulative_regret = 0
    for episode_thousand in range(k_episodes):
        agent.meta_epsilon = agent.meta_epsilon/2.0
        print("\nNew meta-epsilon: %.4f" % agent.meta_epsilon, end="")
        for episode in range(1000):
            print("\n\n### EPISODE %d ###" % (episode_thousand*1000 + episode), end="")
            state = env.reset()
            visits[episode_thousand][state-1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                goal_state = state  # state in which the meta-controller chose this goal
                agent.goal_selected[goal-1] += 1
                print("\nNew Goal: %d\nState-Actions: " % goal)
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal))
                    print((state,action), end="; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state-1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal-1] += 1
                        print("Goal reached!!", end=" ")
                    if next_state == 6:
                        print("S6 reached!!", end=" ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                        intrinsic_reward, one_hot(next_state))
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(goal_state), one_hot(goal),
                                    total_external_reward, one_hot(next_state))
                agent.store(exp, meta=True)
            regret = 1.00 - total_external_reward
            print("\nREGRET: ", regret)
            cumulative_regret += regret
            print("CUMULATIVE REGRET: ", cumulative_regret)
            if (episode % 100 == 99):
                print("")
                print(visits/1000, end="")
    return cumulative_regret, visits/1000
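A hypothetical call to run_architecture is sketched below; every hyperparameter value (layer counts, initializers, node sizes, activations, losses, optimizers, sample sizes, gamma, meta_epsilon) is an illustrative placeholder, not a value from the original experiments:

if __name__ == "__main__":
    regret, visit_rates = run_architecture(
        meta_layers=2, meta_inits=["glorot_uniform"] * 2, meta_nodes=[64, 64],
        meta_activations=["relu", "linear"], meta_loss="mse", meta_optimizer="adam",
        layers=2, inits=["glorot_uniform"] * 2, nodes=[64, 64],
        activations=["relu", "linear"], loss="mse", optimizer="adam",
        n_samples=32, meta_n_samples=32, gamma=0.96,
        meta_epsilon=1.0, k_episodes=12)
    print("cumulative regret:", regret)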
Example #3
def main():
    env = StochasticMDPEnv()
    state = env.current_state
    print("State: %d" % state)
    agent = Agent()
    action = agent.select_move(state)
    state, reward, done = env.step(action)
    while not done:
        print("Action: %d" % action)
        print("Reward: %.2f" % reward)
        print("State: %d" % state)
        action = agent.select_move(state)
        next_state, reward, done = env.step(action)
        agent.update(state, action, reward)
        state = next_state
    print("DONE")
    print(reward)
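All of these examples drive the six-state stochastic decision process from the h-DQN paper (Kulkarni et al., 2016). The environment class itself is not shown in this listing; a minimal sketch of the interface the examples use, where the start state, transition probabilities, and reward values follow the paper and may differ in detail from the original implementation:

class StochasticMDPEnv:
    def __init__(self):
        self.reset()

    def reset(self):
        self.current_state = 2   # episodes start in s2
        self.visited_six = False
        return self.current_state

    def step(self, action):
        # action 1 = "right": succeeds with probability 0.5, otherwise moves left;
        # action 0 = "left": always moves left. The episode ends back in s1.
        if self.current_state != 1:
            if action == 1 and np.random.rand() < 0.5:
                self.current_state = min(self.current_state + 1, 6)
            else:
                self.current_state = max(self.current_state - 1, 1)
        if self.current_state == 6:
            self.visited_six = True
        if self.current_state == 1:
            reward = 1.00 if self.visited_six else 0.01
            return self.current_state, reward, True
        return self.current_state, 0.00, False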
Example #4
def main():
    np.set_printoptions(precision=2)
    env = StochasticMDPEnv()
    agent = Agent()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        for episode in range(1000):
            if episode % 1000 == 0 or episode % 500 == 0:
                print("### EPISODE %d ###" %
                      (episode_thousand * 1000 + episode))
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            action = agent.select_move(one_hot(state))
            state, reward, done = env.step(action)
            visits[episode_thousand][state - 1] += 1
            while not done:
                action = agent.select_move(one_hot(state))
                next_state, reward, done = env.step(action)
                visits[episode_thousand][next_state - 1] += 1
                agent.update(
                    one_hot(state), action,
                    reward + agent.gamma * agent.eval(one_hot(next_state)))
                state = next_state

    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 1.1)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()
Example #5
def main():
    ActorExperience = namedtuple(
        "ActorExperience", ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = hDQN()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        agent.meta_epsilon = agent.meta_epsilon / 2.0
        print("\nNew meta-epsilon: %.4f" % agent.meta_epsilon, end="")
        for episode in range(1000):
            print("\n\n### EPISODE %d ###" %
                  (episode_thousand * 1000 + episode),
                  end="")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                goal_state = state  # state in which the meta-controller chose this goal
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: %d\nState-Actions: " % goal)
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal))
                    print((state, action), end="; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!!", end=" ")
                    if next_state == 6:
                        print("S6 reached!!", end=" ")
                    exp = ActorExperience(one_hot(state), one_hot(goal),
                                          action, intrinsic_reward,
                                          one_hot(next_state))
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(goal_state),
                                     one_hot(goal), total_external_reward,
                                     one_hot(next_state))
                agent.store(exp, meta=True)
            if (episode % 100 == 99):
                print("")
                print(visits / 1000, end="")

    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()
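The intrinsic reward in these scripts comes from agent.criticize(goal, next_state). In the h-DQN formulation this internal critic simply signals whether the controller reached its goal state; a minimal sketch of that idea (the actual method may scale the reward differently):

def criticize(goal, next_state):
    # Intrinsic reward: 1 when the chosen goal state is reached, 0 otherwise.
    return 1.0 if next_state == goal else 0.0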
Example #6
def main():
    # OptimizerSpec (a namedtuple with "constructor" and "kwargs" fields) and
    # NUM_EPISODES are assumed to be defined elsewhere in the original script.
    BATCH_SIZE = 128
    GAMMA = 1.0
    REPLAY_MEMORY_SIZE = 1000000
    LEARNING_RATE = 0.00025
    ALPHA = 0.95
    EPS = 0.01

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(50000, 0.1, 1)

    agent = hDQN(
        optimizer_spec=optimizer_spec,
        replay_memory_size=REPLAY_MEMORY_SIZE,
        batch_size=BATCH_SIZE,
    )

    env = StochasticMDPEnv()

    agent, stats, visits = hdqn_learning(
        env=env,
        agent=agent,
        num_episodes=NUM_EPISODES,
        exploration_schedule=exploration_schedule,
        gamma=GAMMA,
    )

    plot_episode_stats(stats)
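This snippet also depends on a LinearSchedule helper for the exploration rate. Its definition is not shown here; a minimal sketch matching the (timesteps, final value, initial value) argument order used above, in the style of the common DQN utility code this appears to build on:

class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        # Linearly anneal from initial_p to final_p over schedule_timesteps steps, then hold final_p.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)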
Example #7
def main():
    ActorExperience = namedtuple("ActorExperience", ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple("MetaExperience", ["state", "goal", "reward", "next_state", "done"])
    env = StochasticMDPEnv()
    agent = hDQN()
    visits = np.zeros((12, 6))
    anneal_factor = (1.0-0.1)/12000
    print("Annealing factor: " + str(anneal_factor))
    for episode_thousand in range(12):
        for episode in range(1000):
            print("\n\n### EPISODE "  + str(episode_thousand*1000 + episode) + "###")
            state = env.reset()
            visits[episode_thousand][state-1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                goal_state = state  # state in which the meta-controller chose this goal
                agent.goal_selected[goal-1] += 1
                print("\nNew Goal: "  + str(goal) + "\nState-Actions: ")
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal), goal)
                    print(str((state,action)) + "; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state-1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal-1] += 1
                        print("Goal reached!! ")
                    if next_state == 6:
                        print("S6 reached!! ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action, intrinsic_reward, one_hot(next_state), done)
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(goal_state), one_hot(goal), total_external_reward, one_hot(next_state), done)
                agent.store(exp, meta=True)
                
                # Anneal the meta-controller's epsilon linearly; adapt each goal's
                # actor epsilon to that goal's empirical success rate, floored at 0.1.
                agent.meta_epsilon -= anneal_factor
                avg_success_rate = agent.goal_success[goal-1] / agent.goal_selected[goal-1]

                if avg_success_rate == 0 or avg_success_rate == 1:
                    agent.actor_epsilon[goal-1] -= anneal_factor
                else:
                    agent.actor_epsilon[goal-1] = 1 - avg_success_rate

                if agent.actor_epsilon[goal-1] < 0.1:
                    agent.actor_epsilon[goal-1] = 0.1
                print("meta_epsilon: " + str(agent.meta_epsilon))
                print("actor_epsilon " + str(goal) + ": " + str(agent.actor_epsilon[goal-1]))
                
            if (episode % 100 == 99):
                print("")
                print(str(visits/1000) + "")

    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.savefig('first_run.png')
    plt.show()
Example #8
def main():
    np.set_printoptions(precision=2)
    env = StochasticMDPEnv()
    agent = Agent()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        for episode in range(1000):
            done = False
            state = env.reset()
            agent.seen_6 = False
            visits[episode_thousand][state - 1] += 1
            while not done:
                action = agent.select_move(state)
                next_state, reward, done = env.step(action)
                visits[episode_thousand][next_state - 1] += 1
                state = next_state
    print(visits / 1000)

    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()