# Module-level imports (gym, numpy as np, tensorflow as tf) and the hyper-parameter
# constants used below (ENV_*, LOG_FILE, MAX_EPISODES, MAX_EP_STEPS, STATE, INPUT,
# BUFFER_SIZE, MINIBATCH_SIZE, GAMMA, V_EST, SAVE_RATE, ...) are defined earlier
# in the original file.
def train(sess, env, actor, critic):
    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)

    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        print("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Ornstein-Uhlenbeck exploration noise
    n = OUnoise(INPUT)

    for i in xrange(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()

        for j in xrange(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            # Add exploration noise
            # a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch-size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops, feed_dict={
            summary_vars[0]: ep_reward,
            summary_vars[1]: ep_ave_max_q / float(j)
        })

        writer.add_summary(summary_str, i)
        writer.flush()

        print('Episode %d | Reward: %.2i | Qmax: %.4f' %
              (i, int(ep_reward), ep_ave_max_q / float(j)))

        # Log statistics
        L.AddRecord('network_left',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord('network_middle',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False))
        L.AddRecord('network_right',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))

        # Average return over 10 evaluation episodes on the training environment
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(
                actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)

        L.AddRecord('total_reward', ep_reward)

        # Estimate the value function over a sample of stored states
        num = min(replay_buffer.size(), V_EST)
        s_batch, a_batch, r_batch, t_batch, s2_batch = \
            replay_buffer.sample_batch(num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / float(num)
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
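# Note: OUnoise is used above but defined elsewhere in this repo. The class below
# is only a sketch of one plausible Ornstein-Uhlenbeck implementation matching the
# call sites in train() (constructed with the action dimension, Reset() at the start
# of each episode, Sample() per step); the mu/theta/sigma values are illustrative
# assumptions, not the repo's actual parameters.
class OUnoiseSketch(object):
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu          # long-run mean of the process
        self.theta = theta    # speed of mean reversion
        self.sigma = sigma    # scale of the Gaussian perturbation
        self.state = np.ones(self.action_dim) * self.mu

    def Reset(self):
        # Restart the process at its mean at the beginning of an episode.
        self.state = np.ones(self.action_dim) * self.mu

    def Sample(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, I):
        # temporally correlated noise suited to continuous-control exploration.
        dx = self.theta * (self.mu - self.state) \
            + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state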
        tflogger.add_summary(log, log_counter.evaluate(sess))
        print("episode %d/%d (%d), reward: %f" %
              (episode_i, EPISODES, log_counter.evaluate(sess), totalR))
    else:
        print("episode %d/%d, played on policy, reward: %f" %
              (episode_i, EPISODES, totalR))

    # save model and copy the online Q weights into the target network
    if episode_i % SAVE_RATE == 0:
        save_counter.increment(sess)
        saver.save(sess, OUT_DIR + "model.ckpt",
                   global_step=save_counter.evaluate(sess))
        R.SaveBuffer(OUT_DIR + BUFFER_FILE)
        print("model saved, replay buffer: %s" % R.GetOccupency())
        L.Save(OUT_DIR + LOG_FILE)

        Ws, bs = Q.get_weights()
        Q_target.assign(sess, Ws, bs)

# final save after the training loop
save_counter.increment(sess)
saver.save(sess, OUT_DIR + "model.ckpt",
           global_step=save_counter.evaluate(sess))
R.SaveBuffer(OUT_DIR + BUFFER_FILE)
print("model saved, replay buffer: %s" % R.GetOccupency())
L.Save(OUT_DIR + LOG_FILE)

sess.close()

# plot statistics
R_P_l = L.GetLogByName('policy_left')
    temp = []
    for l in xrange(numlogs):
        temp.append(all_logs[l][i])
    avg.append(sum(temp) / float(len(temp)))

avgep = []
var = 0
avgep_up = []
avgep_down = []
for i in xrange(length - LEN):
    # Sliding-window mean and standard deviation over the averaged curve
    a = sum(avg[i:i + LEN]) / float(LEN)
    var = np.sqrt(np.var(avg[i:i + LEN]))
    avgep.append(a)
    avgep_up.append(avgep[i] + var)
    avgep_down.append(avgep[i] - var)
    L.AddRecord(logname, avgep[i])
    L.AddRecord(logname + '_up', avgep_up[i])
    L.AddRecord(logname + '_down', avgep_down[i])

L.Save(OUTLOG)

t = np.arange(1, len(avgep) + 1)
var_up = np.array(avgep_up)
var_down = np.array(avgep_down)

# Plot the smoothed curve with a +/- one-standard-deviation band
plt.figure(1)
plt.fill_between(t, var_down, var_up, facecolor='blue', linewidth=0.0, alpha=0.5)
plt.plot(t, avgep, 'b')
plt.show()
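# Note: the helper below is not part of the original script; it is a self-contained
# restatement (with illustrative names) of the smoothing performed above: a
# sliding-window mean of a reward curve together with a +/- one-standard-deviation
# band, which is what the fill_between call plots.
def sliding_mean_and_band(values, window):
    means, ups, downs = [], [], []
    for start in range(len(values) - window):
        chunk = values[start:start + window]
        m = sum(chunk) / float(window)
        s = np.sqrt(np.var(chunk))
        means.append(m)
        ups.append(m + s)
        downs.append(m - s)
    return means, ups, downs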