def loop(n):
    logger_her.info("***************************")
    logger_her.info("**** Bit flipping game ****")
    logger_her.info("***************************")
    logger_her.info("Start main loop with size {}".format(n))
    logger_her.info("HER STATUS: {}".format(HER))

    actor = QModel(n, HER)
    critic = QModel(n, HER)

    if not TRAIN_FROM_SCRATCH:
        actor.load()
        critic.load()
    else:
        logger_her.info("Training QNetworks from scratch")

    re_buffer = Buffer(BUFFER_SIZE)

    for epoch in range(EPOCHS):
        logger_her.info("Start epoch {}".format(epoch + 1))

        for episode_idx in range(EPISODES):
            goal = State.sample_status(n)
            start = State.sample_status(n)
            # store the start and goal together in a State object
            state = State(start, goal)

            _, episode = sample_episode(actor, state, epsilon_greedy=True)
            re_buffer.add(episode)

            if HER:
                # HER relabeling: replay each transition against goals
                # achieved later in the same episode
                new_experience = []
                for s, a, r, sn in episode:
                    for t in _sample(n, HER_NEW_GOALS):
                        _g = episode[t][-1].status
                        _sn = State(sn.status.copy(), _g.copy())
                        exp = (State(s.status.copy(), _g.copy()), a,
                               0 if _sn.is_final else -1, _sn)
                        new_experience.append(exp)
                re_buffer.add(new_experience)

        for training_step in range(TRAINING_STEPS):
            minibatch = re_buffer.sample(BATCH_SIZE)
            train(critic, actor, minibatch)

        if (epoch + 1) % UPDATE_ACTOR == 0:
            actor.update(critic)

        success_rate = evaluate_actor(actor)
        re_buffer.log_stats()

        if success_rate >= 1. - 1e-9:
            logger_her.info("Learned policy (QAction-Value) for {} bits in {} epochs".format(n, epoch + 1))
            break
def evaluate_actor(actor, episodes_count=TESTING_EPISODES, verbose=0, pause=0):
    success_counter = 0

    for episode_ev in range(episodes_count):
        start = State.sample_status(actor.n)
        goal = State.sample_status(actor.n)

        success, _ = sample_episode(actor, State(start, goal),
                                    epsilon_greedy=False, verbose=verbose)
        success_counter += int(success)

        if pause:
            input("Press <Enter> to continue...")

    logger_her.info("Success/Total {}/{}".format(success_counter, episodes_count))
    logger_her.info("Success rate: {}".format(success_counter / episodes_count))

    return success_counter / episodes_count
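

# A minimal entry-point sketch (not part of the original module) showing how
# loop() might be invoked; BIT_LENGTH is an assumed constant for illustration
# and the real driver script may configure this differently.
if __name__ == "__main__":
    BIT_LENGTH = 15  # hypothetical size of the bit string for the game
    loop(BIT_LENGTH)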