def main():
    """Hierarchical agent on the stochastic MDP: the meta-controller picks goals,
    the controller acts toward them, and both learn from separate replay buffers."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = Agent()
    for episode in range(100):
        print("\n### EPISODE %d ###" % episode)
        state = env.reset()
        done = False
        while not done:
            # Meta-controller picks a goal for the current state.
            goal = agent.select_goal(one_hot(state))
            print("New Goal: %d" % goal)
            total_external_reward = 0
            goal_reached = False
            while not done and not goal_reached:
                print(state, end=",")
                # Controller acts conditioned on (state, goal).
                action = agent.select_move(one_hot(state), one_hot(goal))
                next_state, external_reward, done = env.step(action)
                intrinsic_reward = agent.criticize(one_hot(state), one_hot(goal),
                                                   action, one_hot(next_state))
                goal_reached = next_state == goal
                if goal_reached:
                    print("Success!!")
                exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                      intrinsic_reward, one_hot(next_state))
                agent.store(exp, meta=False)
                agent.update(meta=False)
                agent.update(meta=True)
                total_external_reward += external_reward
                state = next_state
            # One meta-transition per completed goal attempt.
            exp = MetaExperience(one_hot(state), one_hot(goal),
                                 total_external_reward, one_hot(next_state))
            agent.store(exp, meta=True)
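# --- Helper assumed by these scripts (not shown here): one_hot ---
# Every driver encodes states and goals as one-hot vectors before handing them to the
# agent. A minimal sketch, assuming six states indexed 1..6 (consistent with the
# (12, 6) visit-count arrays used below); the repository's actual helper may differ.
import numpy as np

def one_hot(state, n_states=6):
    # Encode a 1-indexed state/goal as a length-n_states one-hot vector.
    vec = np.zeros(n_states)
    vec[state - 1] = 1.0
    return vec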
def run_architecture(meta_layers, meta_inits, meta_nodes, meta_activations,
                     meta_loss, meta_optimizer, layers, inits, nodes, activations,
                     loss, optimizer, n_samples, meta_n_samples, gamma,
                     meta_epsilon, k_episodes=12):
    """Train an hDQN with the given meta-controller/controller architecture for
    k_episodes * 1000 episodes and return (cumulative regret, visit frequencies)."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    # Note: loss, optimizer, and n_samples are accepted above but not passed to hDQN here.
    agent = hDQN(meta_layers=meta_layers, meta_inits=meta_inits, meta_nodes=meta_nodes,
                 meta_activations=meta_activations, meta_loss=meta_loss,
                 meta_optimizer=meta_optimizer, layers=layers, inits=inits,
                 nodes=nodes, activations=activations, meta_n_samples=meta_n_samples,
                 gamma=gamma, meta_epsilon=meta_epsilon)
    # agent = hDQN()
    visits = np.zeros((k_episodes, 6))
    cumulative_regret = 0
    for episode_thousand in range(k_episodes):
        # Halve the meta-controller's exploration rate every 1000 episodes.
        agent.meta_epsilon = agent.meta_epsilon / 2.0
        print("\nNew meta-epsilon: %.4f" % agent.meta_epsilon, end="")
        for episode in range(1000):
            print("\n\n### EPISODE %d ###" % (episode_thousand * 1000 + episode), end="")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: %d\nState-Actions: " % goal)
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal))
                    print((state, action), end="; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!!", end=" ")
                    if next_state == 6:
                        print("S6 reached!!", end=" ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                          intrinsic_reward, one_hot(next_state))
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), one_hot(goal),
                                     total_external_reward, one_hot(next_state))
                agent.store(exp, meta=True)
            # Regret relative to the maximum per-episode reward of 1.00.
            regret = 1.00 - total_external_reward
            print("\nREGRET: ", regret)
            cumulative_regret += regret
            print("CUMULATIVE REGRET: ", cumulative_regret)
            if episode % 100 == 99:
                print("")
                print(visits / 1000, end="")
    return cumulative_regret, visits / 1000
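# --- Environment assumed by these scripts: StochasticMDPEnv ---
# Each driver relies on reset() returning a state in 1..6 and step(action) returning
# (next_state, reward, done). The dynamics below are a sketch following the six-state
# stochastic decision process of Kulkarni et al. (2016): start in s2, "right" succeeds
# with probability 0.5, and the episode terminates at s1 with reward 1.00 if s6 was
# visited and 0.01 otherwise (consistent with the regret = 1.00 - reward computation
# above). The exact probabilities and rewards are assumptions, not this repository's code.
import random

class StochasticMDPEnv:
    def __init__(self):
        self.visited_six = False
        self.current_state = 2

    def reset(self):
        self.visited_six = False
        self.current_state = 2
        return self.current_state

    def step(self, action):
        # action 1: attempt to move right (succeeds with prob 0.5); otherwise move left.
        if action == 1 and self.current_state < 6 and random.random() < 0.5:
            self.current_state += 1
        elif self.current_state > 1:
            self.current_state -= 1
        if self.current_state == 6:
            self.visited_six = True
        if self.current_state == 1:
            # Terminal state: large reward only if the far state s6 was reached first.
            return self.current_state, 1.00 if self.visited_six else 0.01, True
        return self.current_state, 0.0, False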
def main():
    """Flat Q-learning baseline on the stochastic MDP; records per-state visit frequencies."""
    np.set_printoptions(precision=2)
    env = StochasticMDPEnv()
    agent = Agent()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        for episode in range(1000):
            if episode % 500 == 0:
                print("### EPISODE %d ###" % (episode_thousand * 1000 + episode))
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            action = agent.select_move(one_hot(state))
            state, reward, done = env.step(action)
            visits[episode_thousand][state - 1] += 1
            while not done:
                action = agent.select_move(one_hot(state))
                next_state, reward, done = env.step(action)
                visits[episode_thousand][next_state - 1] += 1
                # One-step TD target: r + gamma * max_a Q(s', a)
                agent.update(one_hot(state), action,
                             reward + agent.gamma * agent.eval(one_hot(next_state)))
                state = next_state
    # Plot visit frequency per state (S1..S6), one subplot each.
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 1.1)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()
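# --- Flat baseline agent assumed by the script above ---
# The Q-learning baseline calls agent.select_move(one_hot(state)), agent.eval(one_hot(state)),
# agent.update(one_hot(state), action, target), and reads agent.gamma, but the Agent class is
# defined elsewhere. A minimal tabular sketch that fits this interface; the class name,
# hyperparameters, and the assumption of two primitive actions are hypothetical, not the
# repository's implementation.
import random
import numpy as np

class TabularAgent:
    def __init__(self, n_states=6, n_actions=2, gamma=0.95, lr=0.1, epsilon=0.1):
        self.gamma = gamma
        self.lr = lr
        self.epsilon = epsilon
        self.q = np.zeros((n_states, n_actions))   # Q-table over (state, action)

    def _state_index(self, state_vec):
        return int(np.argmax(state_vec))           # recover the state index from a one-hot vector

    def select_move(self, state_vec):
        # epsilon-greedy action selection
        if random.random() < self.epsilon:
            return random.randrange(self.q.shape[1])
        return int(np.argmax(self.q[self._state_index(state_vec)]))

    def eval(self, state_vec):
        # value of the greedy action in this state (used to form the TD target)
        return float(np.max(self.q[self._state_index(state_vec)]))

    def update(self, state_vec, action, target):
        # move Q(s, a) toward the supplied one-step TD target
        s = self._state_index(state_vec)
        self.q[s, action] += self.lr * (target - self.q[s, action])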
def main():
    """hDQN with default hyperparameters for 12,000 episodes; plots per-state visit frequencies."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = hDQN()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        # Halve the meta-controller's exploration rate every 1000 episodes.
        agent.meta_epsilon = agent.meta_epsilon / 2.0
        print("\nNew meta-epsilon: %.4f" % agent.meta_epsilon, end="")
        for episode in range(1000):
            print("\n\n### EPISODE %d ###" % (episode_thousand * 1000 + episode), end="")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: %d\nState-Actions: " % goal)
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal))
                    print((state, action), end="; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!!", end=" ")
                    if next_state == 6:
                        print("S6 reached!!", end=" ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                          intrinsic_reward, one_hot(next_state))
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), one_hot(goal),
                                     total_external_reward, one_hot(next_state))
                agent.store(exp, meta=True)
            if episode % 100 == 99:
                print("")
                print(visits / 1000, end="")
    # Plot visit frequency per state (S1..S6), one subplot each.
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()
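# --- Agent interface the hierarchical drivers expect ---
# The hDQN class itself is defined elsewhere; the stub below is reconstructed from the calls
# made in these scripts (select_goal, select_move, criticize, store, update, plus the
# meta_epsilon / actor_epsilon / goal_selected / goal_success attributes). The intrinsic-critic
# definition (1.0 when the controller reaches its goal, else 0.0) follows the h-DQN paper and
# should be treated as an assumption here; the optional third argument to select_move mirrors
# the raw goal index passed in the annealed run below.
class HDQNInterface:
    def __init__(self, n_goals=6):
        self.meta_epsilon = 1.0                  # exploration rate of the meta-controller
        self.actor_epsilon = [1.0] * n_goals     # per-goal exploration rate of the controller
        self.goal_selected = [0] * n_goals       # times each goal has been chosen
        self.goal_success = [0] * n_goals        # times a chosen goal was actually reached

    def select_goal(self, state_vec):
        """Meta-controller: epsilon-greedy goal (1..6) for the one-hot state."""
        raise NotImplementedError

    def select_move(self, state_vec, goal_vec, goal=None):
        """Controller: epsilon-greedy primitive action conditioned on (state, goal)."""
        raise NotImplementedError

    def criticize(self, goal, next_state):
        # Intrinsic reward from the internal critic (assumed definition).
        return 1.0 if next_state == goal else 0.0

    def store(self, experience, meta=False):
        """Append to the controller (meta=False) or meta-controller (meta=True) replay buffer."""
        raise NotImplementedError

    def update(self, meta=False):
        """Sample a minibatch from the corresponding replay buffer and take one training step."""
        raise NotImplementedError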
def main():
    """hDQN run with linear epsilon annealing: the meta-controller's epsilon decays from 1.0 to 0.1
    over 12,000 episodes, while each controller epsilon tracks that goal's failure rate."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state", "done"])
    env = StochasticMDPEnv()
    agent = hDQN()
    visits = np.zeros((12, 6))
    # Linear decay from 1.0 to 0.1 over 12,000 episodes.
    anneal_factor = (1.0 - 0.1) / 12000
    print("Annealing factor: " + str(anneal_factor))
    for episode_thousand in range(12):
        for episode in range(1000):
            print("\n\n### EPISODE " + str(episode_thousand * 1000 + episode) + " ###")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: " + str(goal) + "\nState-Actions: ")
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal), goal)
                    print(str((state, action)) + "; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!! ")
                    if next_state == 6:
                        print("S6 reached!! ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                          intrinsic_reward, one_hot(next_state), done)
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), one_hot(goal),
                                     total_external_reward, one_hot(next_state), done)
                agent.store(exp, meta=True)
                # Annealing: the meta-controller decays linearly; each controller's epsilon
                # follows that goal's failure rate, floored at 0.1.
                agent.meta_epsilon -= anneal_factor
                avg_success_rate = agent.goal_success[goal - 1] / agent.goal_selected[goal - 1]
                if avg_success_rate == 0 or avg_success_rate == 1:
                    agent.actor_epsilon[goal - 1] -= anneal_factor
                else:
                    agent.actor_epsilon[goal - 1] = 1 - avg_success_rate
                if agent.actor_epsilon[goal - 1] < 0.1:
                    agent.actor_epsilon[goal - 1] = 0.1
                print("meta_epsilon: " + str(agent.meta_epsilon))
                print("actor_epsilon " + str(goal) + ": " + str(agent.actor_epsilon[goal - 1]))
            if episode % 100 == 99:
                print("")
                print(str(visits / 1000) + "")
    # Plot visit frequency per state (S1..S6), one subplot each.
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.savefig('first_run.png')
    plt.show()
def main():
    np.set_printoptions(precision=2)
    env = StochasticMDPEnv()
    agent = Agent()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        for episode in range(1000):
            done = False
            state = env.reset()
            agent.seen_6 = False
            visits[episode_thousand][state - 1] += 1
            while not done:
                action = agent.select_move(state)
                next_state, reward, done = env.step(action)
                visits[episode_thousand][next_state - 1] += 1
                state = next_state
    print(visits / 1000)
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()