# Training loop: run `imax` episodes, let the agent learn after each one, and
# report a sliding-window average reward plus the best window average seen.
render_demo = False        # demo-rendering switch; not read in this loop — presumably used elsewhere in the file
imax = 5000                # total number of training episodes
period_print = 100         # print a progress line every `period_print` episodes
eval_periods = 100         # not read in this loop — presumably used by later evaluation code
print("\nStarting")
total_reward = 0           # running sum of cumulative rewards over all episodes
# Sliding window of the most recent 2*period_print episode rewards; the
# printed "period reward" is the mean over this window, i.e. over up to
# 200 episodes, not just the last `period_print` — intentional smoothing, presumably.
period_rewards = deque(maxlen=2 * period_print)
best_avg_reward = float('-inf')   # best window-average observed so far
procStartTime = time.time()       # wall-clock start; not read here — likely used after the loop
for i in range(1, imax + 1):
    # Run one full episode on the experiment (doEpisode is defined/imported elsewhere).
    doEpisode(experiment)
    processLastReward(task, agent)  ## store final reward for learner
    agent.learn()
    # NOTE(review): the cumulative reward is read *after* processLastReward/learn
    # here, while the sibling loop below reads it *before* — confirm the intended order.
    reward = task.getCumulativeReward()
    total_reward += reward
    period_rewards.append(reward)
    avg_reward = np.mean(period_rewards)  # mean over the sliding window
    if avg_reward > best_avg_reward:
        best_avg_reward = avg_reward
    if i % period_print == 0:
        # rate = mean reward per episode since the start of training
        print(
            "Episode ended: %i/%i period reward: %f total reward: %d best avg reward: %f rate: %f"
            % (i, imax, avg_reward, total_reward, best_avg_reward,
               total_reward / i))
# Training loop variant: run `imax` episodes with optional per-step rendering,
# accumulate a per-period reward that is reset after each progress print, and
# log the explorer's current epsilon (exploration rate) alongside the stats.
render_demo = False        # demo-rendering switch; not read in this loop — presumably used elsewhere
render_steps = False       # forwarded to doEpisode — presumably toggles per-step rendering
imax = 7000                # total number of training episodes
period_print = 100         # print a progress line every `period_print` episodes
eval_periods = 100         # not read in this loop — presumably used by later evaluation code
print("\nStarting")
total_reward = 0           # running sum of cumulative rewards over all episodes
period_reward = 0          # reward accumulated since the last progress print
procStartTime = time.time()  # wall-clock start; not read here — likely used after the loop
for i in range(1, imax + 1):
    # Run one full episode (doEpisode is defined/imported elsewhere).
    doEpisode(experiment, render_steps)
    # Read the episode's cumulative reward *before* handing the final reward
    # to the learner (opposite order to the sibling loop above).
    reward = task.getCumulativeReward()
    total_reward += reward
    period_reward += reward
    processLastReward(task, agent)  ## store final reward for learner
    agent.learn()
    if i % period_print == 0:
        # Current exploration rate — assumes `explorer` exposes an `epsilon`
        # attribute (epsilon-greedy explorer); confirm against its definition.
        epsil = explorer.epsilon
        # rate = mean reward per episode since the start of training
        print(
            "Episode ended: %i/%i period reward: %f total reward: %d rate: %f epsilon: %f"
            % (i, imax, period_reward / period_print, total_reward,
               total_reward / i, epsil))
        period_reward = 0  # start a fresh accumulation window
def doEpisode(self, demonstrate=False):
    """Run one episode on this object's experiment.

    Thin wrapper that forwards to the module-level ``doEpisode`` function,
    passing ``self.exp`` as the experiment and ``demonstrate`` through as
    the second argument (rendering flag, presumably — confirm against the
    module-level function's signature).

    NOTE(review): the unqualified ``doEpisode`` below resolves to the
    module-level function, not to this method — this is only correct if
    this ``def`` lives inside a class; at module level it would recurse
    infinitely. Confirm a global ``doEpisode`` exists.
    """
    doEpisode(self.exp, demonstrate)