def main():
    """Hierarchical agent on the stochastic MDP: the meta-controller picks goals,
    the controller acts toward them, and both learn from separate replay buffers."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = Agent()
    for episode in range(100):
        print("\n### EPISODE %d ###" % episode)
        state = env.reset()
        done = False
        while not done:
            # Meta-controller picks a goal for the current state.
            goal = agent.select_goal(one_hot(state))
            print("New Goal: %d" % goal)
            total_external_reward = 0
            goal_reached = False
            while not done and not goal_reached:
                print(state, end=",")
                # Controller acts conditioned on (state, goal).
                action = agent.select_move(one_hot(state), one_hot(goal))
                next_state, external_reward, done = env.step(action)
                intrinsic_reward = agent.criticize(one_hot(state), one_hot(goal),
                                                   action, one_hot(next_state))
                goal_reached = next_state == goal
                if goal_reached:
                    print("Success!!")
                exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                      intrinsic_reward, one_hot(next_state))
                agent.store(exp, meta=False)
                agent.update(meta=False)
                agent.update(meta=True)
                total_external_reward += external_reward
                state = next_state
            # One meta-transition per completed goal attempt.
            exp = MetaExperience(one_hot(state), one_hot(goal),
                                 total_external_reward, one_hot(next_state))
            agent.store(exp, meta=True)
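# --- Helper assumed by these scripts (not shown here): one_hot ---
# Every driver encodes states and goals as one-hot vectors before handing them to the
# agent. A minimal sketch, assuming six states indexed 1..6 (consistent with the
# (12, 6) visit-count arrays used below); the repository's actual helper may differ.
import numpy as np

def one_hot(state, n_states=6):
    # Encode a 1-indexed state/goal as a length-n_states one-hot vector.
    vec = np.zeros(n_states)
    vec[state - 1] = 1.0
    return vec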
def run_architecture(meta_layers, meta_inits, meta_nodes, meta_activations,
                     meta_loss, meta_optimizer, layers, inits, nodes, activations,
                     loss, optimizer, n_samples, meta_n_samples, gamma,
                     meta_epsilon, k_episodes=12):
    """Train an hDQN with the given meta-controller/controller architecture for
    k_episodes * 1000 episodes and return (cumulative regret, visit frequencies)."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    # Note: loss, optimizer, and n_samples are accepted above but not passed to hDQN here.
    agent = hDQN(meta_layers=meta_layers, meta_inits=meta_inits, meta_nodes=meta_nodes,
                 meta_activations=meta_activations, meta_loss=meta_loss,
                 meta_optimizer=meta_optimizer, layers=layers, inits=inits,
                 nodes=nodes, activations=activations, meta_n_samples=meta_n_samples,
                 gamma=gamma, meta_epsilon=meta_epsilon)
    # agent = hDQN()
    visits = np.zeros((k_episodes, 6))
    cumulative_regret = 0
    for episode_thousand in range(k_episodes):
        # Halve the meta-controller's exploration rate every 1000 episodes.
        agent.meta_epsilon = agent.meta_epsilon / 2.0
        print("\nNew meta-epsilon: %.4f" % agent.meta_epsilon, end="")
        for episode in range(1000):
            print("\n\n### EPISODE %d ###" % (episode_thousand * 1000 + episode), end="")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: %d\nState-Actions: " % goal)
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal))
                    print((state, action), end="; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!!", end=" ")
                    if next_state == 6:
                        print("S6 reached!!", end=" ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                          intrinsic_reward, one_hot(next_state))
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), one_hot(goal),
                                     total_external_reward, one_hot(next_state))
                agent.store(exp, meta=True)
            # Regret relative to the maximum per-episode reward of 1.00.
            regret = 1.00 - total_external_reward
            print("\nREGRET: ", regret)
            cumulative_regret += regret
            print("CUMULATIVE REGRET: ", cumulative_regret)
            if episode % 100 == 99:
                print("")
                print(visits / 1000, end="")
    return cumulative_regret, visits / 1000
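# --- Environment assumed by these scripts: StochasticMDPEnv ---
# Each driver relies on reset() returning a state in 1..6 and step(action) returning
# (next_state, reward, done). The dynamics below are a sketch following the six-state
# stochastic decision process of Kulkarni et al. (2016): start in s2, "right" succeeds
# with probability 0.5, and the episode terminates at s1 with reward 1.00 if s6 was
# visited and 0.01 otherwise (consistent with the regret = 1.00 - reward computation
# above). The exact probabilities and rewards are assumptions, not this repository's code.
import random

class StochasticMDPEnv:
    def __init__(self):
        self.visited_six = False
        self.current_state = 2

    def reset(self):
        self.visited_six = False
        self.current_state = 2
        return self.current_state

    def step(self, action):
        # action 1: attempt to move right (succeeds with prob 0.5); otherwise move left.
        if action == 1 and self.current_state < 6 and random.random() < 0.5:
            self.current_state += 1
        elif self.current_state > 1:
            self.current_state -= 1
        if self.current_state == 6:
            self.visited_six = True
        if self.current_state == 1:
            # Terminal state: large reward only if the far state s6 was reached first.
            return self.current_state, 1.00 if self.visited_six else 0.01, True
        return self.current_state, 0.0, False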
def main():
    """Flat Q-learning baseline on the stochastic MDP; records per-state visit frequencies."""
    np.set_printoptions(precision=2)
    env = StochasticMDPEnv()
    agent = Agent()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        for episode in range(1000):
            if episode % 500 == 0:
                print("### EPISODE %d ###" % (episode_thousand * 1000 + episode))
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            action = agent.select_move(one_hot(state))
            state, reward, done = env.step(action)
            visits[episode_thousand][state - 1] += 1
            while not done:
                action = agent.select_move(one_hot(state))
                next_state, reward, done = env.step(action)
                visits[episode_thousand][next_state - 1] += 1
                # One-step TD target: r + gamma * max_a Q(s', a)
                agent.update(one_hot(state), action,
                             reward + agent.gamma * agent.eval(one_hot(next_state)))
                state = next_state
    # Plot visit frequency per state (S1..S6), one subplot each.
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 1.1)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()
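# --- Flat baseline agent assumed by the script above ---
# The Q-learning baseline calls agent.select_move(one_hot(state)), agent.eval(one_hot(state)),
# agent.update(one_hot(state), action, target), and reads agent.gamma, but the Agent class is
# defined elsewhere. A minimal tabular sketch that fits this interface; the class name,
# hyperparameters, and the assumption of two primitive actions are hypothetical, not the
# repository's implementation.
import random
import numpy as np

class TabularAgent:
    def __init__(self, n_states=6, n_actions=2, gamma=0.95, lr=0.1, epsilon=0.1):
        self.gamma = gamma
        self.lr = lr
        self.epsilon = epsilon
        self.q = np.zeros((n_states, n_actions))   # Q-table over (state, action)

    def _state_index(self, state_vec):
        return int(np.argmax(state_vec))           # recover the state index from a one-hot vector

    def select_move(self, state_vec):
        # epsilon-greedy action selection
        if random.random() < self.epsilon:
            return random.randrange(self.q.shape[1])
        return int(np.argmax(self.q[self._state_index(state_vec)]))

    def eval(self, state_vec):
        # value of the greedy action in this state (used to form the TD target)
        return float(np.max(self.q[self._state_index(state_vec)]))

    def update(self, state_vec, action, target):
        # move Q(s, a) toward the supplied one-step TD target
        s = self._state_index(state_vec)
        self.q[s, action] += self.lr * (target - self.q[s, action])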
def main():
    """hDQN with default hyperparameters for 12,000 episodes; plots per-state visit frequencies."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state"])
    env = StochasticMDPEnv()
    agent = hDQN()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        # Halve the meta-controller's exploration rate every 1000 episodes.
        agent.meta_epsilon = agent.meta_epsilon / 2.0
        print("\nNew meta-epsilon: %.4f" % agent.meta_epsilon, end="")
        for episode in range(1000):
            print("\n\n### EPISODE %d ###" % (episode_thousand * 1000 + episode), end="")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: %d\nState-Actions: " % goal)
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal))
                    print((state, action), end="; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!!", end=" ")
                    if next_state == 6:
                        print("S6 reached!!", end=" ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                          intrinsic_reward, one_hot(next_state))
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), one_hot(goal),
                                     total_external_reward, one_hot(next_state))
                agent.store(exp, meta=True)
            if episode % 100 == 99:
                print("")
                print(visits / 1000, end="")
    # Plot visit frequency per state (S1..S6), one subplot each.
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()
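# --- Agent interface the hierarchical drivers expect ---
# The hDQN class itself is defined elsewhere; the stub below is reconstructed from the calls
# made in these scripts (select_goal, select_move, criticize, store, update, plus the
# meta_epsilon / actor_epsilon / goal_selected / goal_success attributes). The intrinsic-critic
# definition (1.0 when the controller reaches its goal, else 0.0) follows the h-DQN paper and
# should be treated as an assumption here; the optional third argument to select_move mirrors
# the raw goal index passed in the annealed run below.
class HDQNInterface:
    def __init__(self, n_goals=6):
        self.meta_epsilon = 1.0                  # exploration rate of the meta-controller
        self.actor_epsilon = [1.0] * n_goals     # per-goal exploration rate of the controller
        self.goal_selected = [0] * n_goals       # times each goal has been chosen
        self.goal_success = [0] * n_goals        # times a chosen goal was actually reached

    def select_goal(self, state_vec):
        """Meta-controller: epsilon-greedy goal (1..6) for the one-hot state."""
        raise NotImplementedError

    def select_move(self, state_vec, goal_vec, goal=None):
        """Controller: epsilon-greedy primitive action conditioned on (state, goal)."""
        raise NotImplementedError

    def criticize(self, goal, next_state):
        # Intrinsic reward from the internal critic (assumed definition).
        return 1.0 if next_state == goal else 0.0

    def store(self, experience, meta=False):
        """Append to the controller (meta=False) or meta-controller (meta=True) replay buffer."""
        raise NotImplementedError

    def update(self, meta=False):
        """Sample a minibatch from the corresponding replay buffer and take one training step."""
        raise NotImplementedError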
def main():
    """hDQN run with linear epsilon annealing: the meta-controller's epsilon decays from 1.0 to 0.1
    over 12,000 episodes, while each controller epsilon tracks that goal's failure rate."""
    ActorExperience = namedtuple("ActorExperience",
                                 ["state", "goal", "action", "reward", "next_state", "done"])
    MetaExperience = namedtuple("MetaExperience",
                                ["state", "goal", "reward", "next_state", "done"])
    env = StochasticMDPEnv()
    agent = hDQN()
    visits = np.zeros((12, 6))
    # Linear decay from 1.0 to 0.1 over 12,000 episodes.
    anneal_factor = (1.0 - 0.1) / 12000
    print("Annealing factor: " + str(anneal_factor))
    for episode_thousand in range(12):
        for episode in range(1000):
            print("\n\n### EPISODE " + str(episode_thousand * 1000 + episode) + " ###")
            state = env.reset()
            visits[episode_thousand][state - 1] += 1
            done = False
            while not done:
                goal = agent.select_goal(one_hot(state))
                agent.goal_selected[goal - 1] += 1
                print("\nNew Goal: " + str(goal) + "\nState-Actions: ")
                total_external_reward = 0
                goal_reached = False
                while not done and not goal_reached:
                    action = agent.select_move(one_hot(state), one_hot(goal), goal)
                    print(str((state, action)) + "; ")
                    next_state, external_reward, done = env.step(action)
                    visits[episode_thousand][next_state - 1] += 1
                    intrinsic_reward = agent.criticize(goal, next_state)
                    goal_reached = next_state == goal
                    if goal_reached:
                        agent.goal_success[goal - 1] += 1
                        print("Goal reached!! ")
                    if next_state == 6:
                        print("S6 reached!! ")
                    exp = ActorExperience(one_hot(state), one_hot(goal), action,
                                          intrinsic_reward, one_hot(next_state), done)
                    agent.store(exp, meta=False)
                    agent.update(meta=False)
                    agent.update(meta=True)
                    total_external_reward += external_reward
                    state = next_state
                exp = MetaExperience(one_hot(state), one_hot(goal),
                                     total_external_reward, one_hot(next_state), done)
                agent.store(exp, meta=True)
                # Annealing: the meta-controller decays linearly; each controller's epsilon
                # follows that goal's failure rate, floored at 0.1.
                agent.meta_epsilon -= anneal_factor
                avg_success_rate = agent.goal_success[goal - 1] / agent.goal_selected[goal - 1]
                if avg_success_rate == 0 or avg_success_rate == 1:
                    agent.actor_epsilon[goal - 1] -= anneal_factor
                else:
                    agent.actor_epsilon[goal - 1] = 1 - avg_success_rate
                if agent.actor_epsilon[goal - 1] < 0.1:
                    agent.actor_epsilon[goal - 1] = 0.1
                print("meta_epsilon: " + str(agent.meta_epsilon))
                print("actor_epsilon " + str(goal) + ": " + str(agent.actor_epsilon[goal - 1]))
            if episode % 100 == 99:
                print("")
                print(str(visits / 1000) + "")
    # Plot visit frequency per state (S1..S6), one subplot each.
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.savefig('first_run.png')
    plt.show()
def main():
    np.set_printoptions(precision=2)
    env = StochasticMDPEnv()
    agent = Agent()
    visits = np.zeros((12, 6))
    for episode_thousand in range(12):
        for episode in range(1000):
            done = False
            state = env.reset()
            agent.seen_6 = False
            visits[episode_thousand][state - 1] += 1
            while not done:
                action = agent.select_move(state)
                next_state, reward, done = env.step(action)
                visits[episode_thousand][next_state - 1] += 1
                state = next_state
    print(visits / 1000)
    eps = list(range(1, 13))
    for i in range(6):
        plt.subplot(2, 3, i + 1)
        plt.plot(eps, visits[:, i] / 1000)
        plt.xlabel("Episodes (*1000)")
        plt.ylim(-0.01, 2.0)
        plt.xlim(1, 12)
        plt.title("S%d" % (i + 1))
        plt.grid(True)
    plt.show()