def learn(self, env, episode_count=1000, gamma=0.9, learning_rate=0.1, render=False, report_interval=50):
    """Run tabular Q-learning against *env*.

    Args:
        env: environment exposing ``reset``/``step``/``render`` and
            ``action_space.n`` (number of discrete actions).
        episode_count: number of episodes to play. Default 1000.
        gamma: discount factor for future rewards. Default 0.9.
        learning_rate: step size of the TD update. Default 0.1.
        render: when True, redraw the environment every step. Default False.
        report_interval: episode interval for reporting. Default 50.
    """
    self.init_log()
    actions = list(range(env.action_space.n))
    # Q-table: unseen states map to a zero value per action.
    self.Q = defaultdict(lambda: [0] * len(actions))
    for episode in tqdm(range(episode_count)):
        state = env.reset()
        done = False
        while not done:
            if render:
                cuitools.reset()
                env.render()
                time.sleep(0.01)
            action = self.policy(state, actions)
            next_state, reward, done, info = env.step(action)
            # TD target: immediate reward plus discounted best next-state value.
            target = reward + gamma * max(self.Q[next_state])
            self.Q[state][action] += learning_rate * (target - self.Q[state][action])
            state = next_state
        else:
            # The loop has no break, so this always runs once the episode
            # ends: log the final reward of the episode.
            self.log(reward)
        if episode != 0 and episode % report_interval == 0:
            pass  # reporting hook (intentionally left empty)
def learn(self, env, episode_count=1000, gamma=0.9, render=False, report_interval=50):
    """Run every-visit Monte Carlo control against *env*.

    Args:
        env: environment exposing ``reset``/``step``/``render`` and
            ``action_space.n`` (number of discrete actions).
        episode_count: number of episodes to play. Default 1000.
        gamma: discount factor for future rewards. Default 0.9.
        render: when True, redraw the environment every step. Default False.
        report_interval: episode interval for reporting. Default 50.
    """
    self.init_log()
    actions = list(range(env.action_space.n))
    self.Q = defaultdict(lambda: [0] * len(actions))
    N = defaultdict(lambda: [0] * len(actions))  # visit counts per (s, a)
    for e in tqdm(range(episode_count)):
        s = env.reset()
        done = False
        # Play until the end of episode.
        experience = []
        while not done:
            if render:
                cuitools.reset()
                env.render()
                time.sleep(0.01)
            a = self.policy(s, actions)
            n_state, reward, done, info = env.step(a)
            experience.append({"state": s, "action": a, "reward": reward})
            s = n_state
        else:
            self.log(reward)
        # Discounted return of every step in ONE backward sweep
        # (G_t = r_t + gamma * G_{t+1}) instead of the previous O(n^2)
        # forward re-summation per step.
        returns = [0] * len(experience)
        G = 0
        for i in reversed(range(len(experience))):
            G = experience[i]["reward"] + gamma * G
            returns[i] = G
        # Evaluate each state, action (same forward update order as before).
        for x, ret in zip(experience, returns):
            s, a = x["state"], x["action"]
            N[s][a] += 1  # count of s, a pair
            alpha = 1 / N[s][a]  # running-mean step size
            self.Q[s][a] += alpha * (ret - self.Q[s][a])
        if e != 0 and e % report_interval == 0:
            pass  # reporting hook (intentionally left empty)
def train(self, env, episode_count=1000, gamma=0.9, learning_rate=0.1, render=False, report_interval=50):
    """Train an actor and a critic with one-step TD updates.

    Args:
        env: environment exposing ``reset``/``step``/``render``.
        episode_count: number of episodes to play. Default 1000.
        gamma: discount factor for future rewards. Default 0.9.
        learning_rate: step size of both TD updates. Default 0.1.
        render: when True, redraw the environment every step. Default False.
        report_interval: episode interval for reporting. Default 50.

    Returns:
        Tuple ``(actor, critic)`` holding the trained agents.
    """
    actor = self.actor_class(env)
    critic = self.critic_class(env)
    actor.init_log()
    for episode in tqdm(range(episode_count)):
        state = env.reset()
        done = False
        while not done:
            if render:
                cuitools.reset()
                env.render()
                time.sleep(0.01)
            action = actor.policy(state)
            next_state, reward, done, info = env.step(action)
            # One-step TD error from the critic's value estimates.
            td_error = (reward + gamma * critic.V[next_state]) - critic.V[state]
            # Both actor and critic move along the same TD error.
            actor.Q[state][action] += learning_rate * td_error
            critic.V[state] += learning_rate * td_error
            state = next_state
        else:
            # The loop has no break, so this always runs at episode end.
            actor.log(reward)
        if episode != 0 and episode % report_interval == 0:
            pass  # reporting hook (intentionally left empty)
    return actor, critic
# NOTE(review): this chunk starts mid-script — `T`, `ref`, `diss`, `debut`
# and the helpers `plus_proche` / `tps_plus_proche` are presumably defined
# above this view; confirm before moving these lines.
_ = plus_proche(T, ref, diss)
fin = time()
duree = fin - debut
print("Duree (moyen): ", duree)
n = 100000  # large input size
# Rebuild a random array of n values and time one nearest-neighbour query.
T = []
for _ in range(n):
    T.append(randint(0, n))
ref = randint(0, n)
debut = time()
_ = plus_proche(T, ref, diss)
fin = time()
duree = fin - debut
print("Duree (grand): ", duree)
print(f"Duree (grand): {tps_plus_proche(n)}")
# Sample running time for increasing n to visualise the growth curve.
N = []
Tps = []
for n in tqdm(range(10, 10**5, 200)):
    N.append(n)
    # timing measurement
    Tps.append(tps_plus_proche(n))
# plot the N vs Tps curve
plt.figure()
plt.plot(N, Tps)
plt.show()
# tqdm must be installed first.
# from tqdm import tqdm           # plain Python program
# from tqdm.notebook import tqdm  # inside a notebook
from tqdm.gui import tqdm  # for Thonny
from time import sleep
from random import randint

# Nested progress bars: the inner bar is erased between outer iterations
# (leave=False). Each inner step sleeps 1-2 s to simulate work.
for i in tqdm(range(5), desc='Boucle sur i'):
    # Fix: the desc below was a pointless f-string with no placeholders
    # (ruff F541); the plain literal is identical at runtime.
    for j in tqdm(range(3), desc='Boucle sur j', leave=False):
        duree_sommeil = randint(1, 2)
        sleep(duree_sommeil)