t_min, t_max = 0.4, 0.6

# Algorithm
rewards_all_episodes = []

# Stats
deleted_all_episodes = []
corrected_all_episodes = []
incorrect_all_episodes = []

for episode in range(num_episodes):
    # Repair episode initialization
    R_trajectory = 0
    trajectory = []

    # Q-learning episode initialization
    state_v = env.reset()
    state_c = state_to_coord(state_v)
    done = False
    print(f"{episode + 1} / {num_episodes}")

    # Play episode
    for step in range(max_steps_per_episode):
        # Exploration vs. exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state_c, :])
        else:
            action = np.random.choice(action_space_size)
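        # What follows the action selection is not shown above; a minimal sketch,
        # assuming env.step returns (observation, reward, done, info) as in the
        # classic Gym API and that each transition is recorded in the trajectory
        # for the later repair step. The repair logic itself, which uses t_min/t_max
        # and fills the deleted/corrected/incorrect stats, is omitted here.
        new_state_v, reward, done, info = env.step(action)
        new_state_c = state_to_coord(new_state_v)

        # Record the transition for trajectory-level bookkeeping
        trajectory.append((state_c, action, reward, new_state_c))
        R_trajectory += reward

        state_v, state_c = new_state_v, new_state_c
        if done:
            break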
max_steps_per_episode = 1000000

learning_rate = 0.1       # alpha
discount_rate = 0.99      # gamma

exploration_rate = 1      # epsilon
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# Algorithm
rewards_all_episodes = []

# Q-learning
for episode in range(num_episodes):
    state = env.reset()
    state = state_to_coord(state)
    done = False
    rewards_current_episode = 0
    print(f"{episode + 1} / {num_episodes}")

    # Play episode
    for step in range(max_steps_per_episode):
        # Exploration vs. exploitation
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = np.random.choice(action_space_size)
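        # The remainder of the loop is not shown above; a minimal sketch of the
        # standard tabular Q-learning update and exploration decay, assuming
        # env.step returns (observation, reward, done, info) as in the classic
        # Gym API and that the hyperparameters defined above are used as usual:
        new_state, reward, done, info = env.step(action)
        new_state = state_to_coord(new_state)

        # Bellman update of the Q-table entry for (state, action)
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward
        if done:
            break

    # Exponential decay of the exploration rate after each episode
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)
    rewards_all_episodes.append(rewards_current_episode)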
    agent.store_demonstration(s, a, r, s_, done, int(episode_idx))
    if done:
        episode_idx += 1

# Pre-train
# agent.replay.tree.start = start
for i in range(k1):
    if i % 100 == 0:
        print("pretraining:", i)
    agent.learn()

# Train
accumulated_rewards_all_episodes = []
for episode in range(k2):
    s = env.reset()
    accumulated_rewards = 0
    done = False
    while not done:
        a = agent.choose_action(s)
        s_, r, done, feedback = env.step(a[0])
        accumulated_rewards += r
        r += feedback  # add the environment's feedback signal to the reward used for learning
        if done:
            r = 0  # TODO: not sure whether this is necessary; try with, without, and with a different value
        agent.store_transition(s, a, r, s_, done)
        agent.learn()
        s = s_
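    # Per-episode bookkeeping is not shown above; a minimal sketch of recording
    # the raw (feedback-free) return for later evaluation:
    accumulated_rewards_all_episodes.append(accumulated_rewards)
    if (episode + 1) % 100 == 0:
        print(f"episode {episode + 1} / {k2}, return: {accumulated_rewards}")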