print("Model loaded from disk") # define action discretization max_a = env.action_space.high[0] min_a = env.action_space.low[0] act = actions(ACTION_SIZE, max_a) actions_deque, _ = act.get_action() discretizer = Discretizer(actions_deque) policy = ChasePolicy(STATE_SIZE, ACTION_SIZE, max_a, min_a) n = OUnoise(2, 0.5, 50) ################ # simulators ################ simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, actions_deque) # main learning loop print("Starting to learn in environment: " + ENVIRONMENT) steps = 0 for episode_i in xrange(1, EPISODES + 1): policy_exp = np.random.uniform() if policy_exp <= EPSILON: onPolicy = True n.Reset() else: onPolicy = False st = env.reset() mdp.reset()
print("Model loaded from disk") # define action discretization max_a = env.action_space.high[0] min_a = env.action_space.low[0] act = actions(ACTION_SIZE, max_a) actions_deque,_ = act.get_action() discretizer = Discretizer(actions_deque) policy = ChasePolicy(STATE_SIZE, ACTION_SIZE, max_a, min_a) n = OUnoise(2,0.5,NOISE) ################ # simulators ################ simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, actions_deque) # main learning loop print("Starting to learn in environment: " + ENVIRONMENT) steps = 0 for episode_i in xrange(1,EPISODES+1): policy_exp = np.random.uniform() n.Reset() if policy_exp <= EPSILON_P: onPolicy = True else: onPolicy = False st = env.reset() mdp.reset()
if ckpt and ckpt.model_checkpoint_path:
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Model loaded from disk")

# define action discretization
max_a = env.action_space.high[0]
min_a = env.action_space.low[0]
act = actions(ACTION_SIZE, max_a)
actions_deque, _ = act.get_action()
discretizer = Discretizer(actions_deque)

################
# simulators
################
simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, actions_deque)

# main learning loop
print("Starting to learn in environment: " + ENVIRONMENT)
steps = steps_counter.evaluate(sess)
C_steps_counter.evaluate(sess)
for episode_i in xrange(1, EPISODES + 1):
    episodes_counter.increment(sess)
    st = env.reset()
    mdp.add_frame(st)
    st = mdp.get_MDP_state()
    totalR = 0
    totalE = 0
    for t in xrange(1, STEPS + 1):
        if DISPLAY:
            env.render()
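# ---------------------------------------------------------------------------
# NOTE: the mdp object used above (mdp.reset() / mdp.add_frame() /
# mdp.get_MDP_state()) is defined elsewhere in the project. The class below is
# only a minimal sketch of a frame-stacking state builder with that interface,
# assuming the MDP state is the concatenation of the last FRAMES raw
# observations; it is an assumption, not the project's implementation.
# ---------------------------------------------------------------------------
from collections import deque
import numpy as np

class FrameStackMDP(object):
    def __init__(self, state_size, frames):
        self.state_size = state_size
        self.frames = frames
        self.reset()

    def reset(self):
        # start every episode from an all-zero frame history
        self.buffer = deque([np.zeros(self.state_size)] * self.frames,
                            maxlen=self.frames)

    def add_frame(self, observation):
        # push the newest raw observation, dropping the oldest one
        self.buffer.append(np.asarray(observation, dtype=np.float32))

    def get_MDP_state(self):
        # concatenate the stacked frames into a single flat state vector
        return np.concatenate(self.buffer)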
def train(sess, env, actor, critic):
    env_left = gym.make(ENV_LEFT)
    env_middle = gym.make(ENV_MIDDLE)
    env_right = gym.make(ENV_RIGHT)

    L = Logger()
    log_not_empty = L.Load(LOG_FILE)
    if log_not_empty:
        print("Log file loaded")
    else:
        print("Creating new log file")
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        L.AddNewLog('total_reward')
        L.AddNewLog('estimated_value')
        L.AddNewLog('network_random')

    simulator = Simulator(MAX_EP_STEPS, STATE, 1, -0.5, None)

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.initialize_all_variables())
    writer = tf.train.SummaryWriter(SUMMARY_DIR, sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    n = OUnoise(INPUT)

    for i in xrange(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        n.Reset()

        for j in xrange(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            # Add exploration noise to the deterministic policy output
            # a = actor.predict(np.reshape(s, (1, 8))) + (1. / (1. + i + j))
            a = actor.predict(np.reshape(s, (1, STATE))) + n.Sample()

            s2, r, terminal, info = env.step(a[0])
            r += -0.5

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                break

        summary_str = sess.run(summary_ops, feed_dict={
            summary_vars[0]: ep_reward,
            summary_vars[1]: ep_ave_max_q / float(j)
        })
        writer.add_summary(summary_str, i)
        writer.flush()

        print 'episode ', i, ' | Reward: %.2i' % int(ep_reward), \
            '| Qmax: %.4f' % (ep_ave_max_q / float(j))

        # log statistics
        L.AddRecord('network_left',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_left, False))
        L.AddRecord('network_middle',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_middle, False))
        L.AddRecord('network_right',
                    simulator.SimulateContNeuralEpisode(actor, sess, env_right, False))
        temp_r = 0
        for rand_i in xrange(10):
            temp_r = temp_r + simulator.SimulateContNeuralEpisode(actor, sess, env, False) * 0.1
        L.AddRecord('network_random', temp_r)
        L.AddRecord('total_reward', ep_reward)

        # estimate the value function on a sample of stored states
        if replay_buffer.size() > V_EST:
            num = V_EST
        else:
            num = replay_buffer.size()
        s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(num)
        Q = critic.predict(s_batch, actor.predict(s_batch))
        V_est = Q.sum() / float(num)
        L.AddRecord('estimated_value', V_est)

        if i % SAVE_RATE == 0:
            L.Save(LOG_FILE)
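# ---------------------------------------------------------------------------
# NOTE: ReplayBuffer is imported from elsewhere in the repo; train() only
# relies on its add(), size() and sample_batch() methods. The class below is a
# minimal sketch of that interface built on a deque, an assumption rather than
# the project's actual implementation.
# ---------------------------------------------------------------------------
import random
from collections import deque
import numpy as np

class ReplayBufferSketch(object):
    def __init__(self, buffer_size, random_seed=123):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, t, s2):
        # store one transition; the deque drops the oldest entry when full
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # sample uniformly at random, without replacement when possible
        batch = random.sample(list(self.buffer), min(batch_size, len(self.buffer)))
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2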
if log_not_empty:
    print("Log file loaded")
else:
    print("Creating new log file")
    if ENVIRONMENT_NAME == 'Hockey-v2':
        L.AddNewLog('network_left')
        L.AddNewLog('network_middle')
        L.AddNewLog('network_right')
        # L.AddNewLog('error')
    L.AddNewLog('total_reward')
    L.AddNewLog('estimated_value')
    L.AddNewLog('network_random')

if ENVIRONMENT_NAME == 'Hockey-v2':
    simulator = Simulator(STEPS, STATE_SIZE, FRAMES, T, None)

steps = steps_counter.evaluate(sess)
C_steps_counter.evaluate(sess)

for ep in range(EPISODES):
    episodes_counter.increment(sess)
    # open up a game state
    s_t, r_0, done = env.reset(), 0, False
    n.Reset()
    REWARD = 0
    totalR = 0
    totalE = 0
    # exploration.reset()
    for t in range(STEPS):
        if DISPLAY:
            env.render()
        # select action according to current policy and exploration noise
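# ---------------------------------------------------------------------------
# NOTE: OUnoise is imported from elsewhere in the repo; the fragments above
# only use its constructor, Reset() and Sample(). The class below is a minimal
# Ornstein-Uhlenbeck process sketch with that interface, the standard choice
# for temporally correlated exploration noise in DDPG. The mu, theta and sigma
# defaults are illustrative assumptions, not the project's tuned values.
# ---------------------------------------------------------------------------
import numpy as np

class OUNoiseSketch(object):
    """Temporally correlated exploration noise for a deterministic actor."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.Reset()

    def Reset(self):
        # restart the process at its mean at the beginning of each episode
        self.state = np.ones(self.action_dim) * self.mu

    def Sample(self):
        # Euler step of the OU process: dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state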