# Training loop: run NUM_EPISODES episodes of DDPG on the pendulum env.
# Every EVALUATE_EVERY-th episode is an evaluation episode (rendered,
# greedy noise-free policy, verbose printing); all other episodes act
# with exploration noise whose magnitude decays linearly over training.
#
# NOTE(review): the original re-created the FileWriter inside the episode
# loop, leaking one writer per episode — hoisted out here.
summary_writer = tf.summary.FileWriter('/tmp/pendulum-log-0'+'/train', graph=tf.get_default_graph())
for i in range(NUM_EPISODES):
    cur_state = env.reset()
    cum_reward = 0
    if (i % EVALUATE_EVERY) == 0:
        print ('====evaluation====')
    for t in range(MAX_STEPS):
        if (i % EVALUATE_EVERY) == 0:
            env.render()
            # evaluation: greedy (noise-free) action
            action = agent.get_action(cur_state, sess)[0]
        else:
            # decaying noise: rate shrinks from 1 toward 0 as i grows
            action = agent.get_action_noise(cur_state, sess, rate=(NUM_EPISODES-i)/NUM_EPISODES)[0]
        next_state, reward, done, info = env.step(action)
        if (i % EVALUATE_EVERY) == 0:
            print('cur_state: ', end='')
            print(cur_state)
            print('action: ', end='')
            print(action)
            print('reward: ' + str(reward))
            print('------------------------------------------------------')
        cum_reward += reward
        # Store EVERY transition in the replay buffer. The original only
        # stored terminal transitions, starving the buffer of non-terminal
        # experience (compare the second training loop in this file).
        agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done))
        if done:
            print("Done! Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward))
            summarize(cum_reward, i, summary_writer)
            break
        # Advance the state — missing in the original, which made the agent
        # act on the episode's reset state at every timestep.
        cur_state = next_state
# Build the DDPG agent and run its training loop. Every EVALUATE_EVERY-th
# episode renders the env, uses the greedy (noise-free) policy, and prints
# per-step diagnostics through `printer`; other episodes explore with
# linearly decaying noise. Per-step reward/util/delay are appended to the
# history lists for later analysis.
agent = DDPG(actor=actor, critic=critic, exprep=exprep, noise=noise, action_bound=setting.ACTION_RANGE)
# tf.initialize_all_variables() is deprecated in TF 1.x (this file already
# uses the TF1 summary API); use the supported initializer instead.
sess.run(tf.global_variables_initializer())
for i in range(setting.NUM_EPISODES):
    cur_state = env.reset()
    cum_reward = 0
    if (i % setting.EVALUATE_EVERY) == 0:
        print ('====evaluation====')
    for t in range(setting.MAX_STEPS):
        print("Time step: " + str(t))
        if (i % setting.EVALUATE_EVERY) == 0:
            env.render()
            action = agent.get_action(cur_state, sess)[0]
        else:
            # decaying noise: rate shrinks from 1 toward 0 as i grows
            action = agent.get_action_noise(cur_state, sess, rate=(setting.NUM_EPISODES-i)/setting.NUM_EPISODES)[0]
        # env expects non-negative actions — TODO confirm convertToPositive's contract
        action = convertToPositive(action)
        next_state, reward, done, info = env.step(action)
        # `info` is assumed to be a comma-separated string whose fields 2 and 3
        # are utilization and delay — inferred from the indices used below;
        # verify against the environment's step() implementation.
        infos = info.split(',')
        rew_history.append(reward)
        util_history.append(float(infos[2]))
        delay_history.append(float(infos[3]))
        cum_reward += reward
        agent.add_step(Step(cur_step=cur_state, action=action, next_step=next_state, reward=reward, done=done))
        if (i % setting.EVALUATE_EVERY) == 0:
            printer.print_state(cur_state)
            printer.do_job('action', action)
            printer.do_job('reward, util, delay', [reward, float(infos[2]), float(infos[3])])
            printer.do_line()
        if done or t == setting.MAX_STEPS - 1:
            print("Done! Episode {} finished after {} timesteps, cum_reward: {}".format(i, t + 1, cum_reward))
            # End the episode — the original fell through and kept stepping
            # a terminal environment.
            break
        # Advance the state — missing in the original, which made the agent
        # act on the episode's reset state at every timestep.
        cur_state = next_state