import random

import numpy as np
import tensorflow as tf  # TF 1.x API (tf.Session, tf.global_variables_initializer)

# Actor and Critic are the policy and value networks, defined elsewhere in the project.


class ActorCritic(object):
    def __init__(self, env):
        LR_A = 0.001  # learning rate for the actor
        LR_C = 0.01   # learning rate for the critic
        num_features = env.observation_space.shape[0]  # num_features = 14
        num_actions = env.action_space.shape[0]
        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        # The critic should learn faster than the actor so it can act as a good teacher.
        self.critic = Critic(sess, n_features=num_features, lr=LR_C)
        sess.run(tf.global_variables_initializer())

    def get_action(self, state, episode_percentage):
        # Sometimes pick a random action to explore.
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        # Quadratic decay from 1.0 at the start of training to 0.0 at the end.
        epsilon = -1 * (episode_percentage ** 2) + 1
        # Other schedules tried:
        # epsilon = -1 * (episode_percentage - 1) ** 3
        # epsilon = -0.8 * (episode_percentage - 1) ** 3 + 0.2
        # epsilon = -0.8 * episode_percentage + 1
        return epsilon

    def update(self, state, action, reward, new_state):
        # gradient = grad[r + gamma * V(s_) - V(s)]
        td_error = self.critic.learn(state, reward, new_state)
        # true_gradient = grad[logPi(s,a) * td_error]
        self.actor.learn(state, action, td_error)

    def get_name(self):
        return 'ActorCritic'
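The Actor and Critic classes referenced above are defined elsewhere in the project; only their call sites appear here. The sketch below is a guess at the interface this wrapper assumes, with method names and constructor arguments inferred from the calls above and placeholder bodies rather than the original implementations.

import tensorflow as tf  # TF 1.x style, matching the tf.Session() usage above


class Actor(object):
    """Continuous-action policy network (interface sketch only)."""

    def __init__(self, sess, n_features, action_bound, lr=0.001):
        self.sess = sess  # shared TF1 session

    def choose_action(self, state):
        # Return a sampled action; the [0] indexing above extracts the scalar.
        raise NotImplementedError

    def learn(self, state, action, td_error):
        # One policy-gradient step weighted by the critic's TD error.
        raise NotImplementedError


class Critic(object):
    """State-value network (interface sketch only)."""

    def __init__(self, sess, n_features, lr=0.01):
        self.sess = sess

    def learn(self, state, reward, new_state):
        # One TD(0) update; returns td_error = r + gamma * V(s') - V(s).
        raise NotImplementedError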
class ActorCriticExperienceReplay(object):
    def __init__(self, env):
        self.MEMORY_SIZE = 200
        self.BATCH_SIZE = 10
        LR_A = 0.001  # learning rate for actor
        LR_C = 0.01   # learning rate for critic
        num_features = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]
        self.action_space = env.action_space

        sess = tf.Session()
        self.actor = Actor(
            sess,
            n_features=num_features,
            action_bound=[env.action_space.low[0], env.action_space.high[0]],
            lr=LR_A)
        # we need a good teacher, so the teacher should learn faster than the actor
        self.critic = Critic(sess, n_features=num_features, lr=LR_C)
        sess.run(tf.global_variables_initializer())

        self.replay_memory = []

    def get_action(self, state, episode_percentage):
        # Sometimes pick random action to explore
        if np.random.random() < self.get_exploration_prob(episode_percentage):
            return self.action_space.sample()
        else:
            return self.actor.choose_action(state)[0]

    def get_exploration_prob(self, episode_percentage):
        return -1 * (episode_percentage ** 2) + 1
        # return -1 * (episode_percentage - 1) ** 3

    def update(self, state, action, reward, new_state):
        td_error = self.critic.learn(
            state, reward, new_state)  # gradient = grad[r + gamma * V(s_) - V(s)]
        self.actor.learn(
            state, action, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

        # Add to replay memory
        self.replay_memory.append((state, action, reward, new_state))
        if len(self.replay_memory) >= self.MEMORY_SIZE:
            self.replay_memory.pop(0)

        # Learn from replayed memories
        if np.random.random() < 0.5 and len(self.replay_memory) > self.BATCH_SIZE:
            minibatch = random.sample(self.replay_memory, self.BATCH_SIZE)
            for (batch_state, batch_action, batch_reward, batch_new_state) in minibatch:
                td_error = self.critic.learn(batch_state, batch_reward, batch_new_state)
                self.actor.learn(batch_state, batch_action, td_error)

    def get_name(self):
        return 'ActorCritic_ExperienceReplay'
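The replay buffer above is a plain list trimmed with pop(0), which is O(n) per eviction. A collections.deque with maxlen gives the same bounded FIFO behaviour with O(1) eviction. The sketch below shows that alternative; the helper names are illustrative, and actor/critic stand for the same objects used above.

import random
from collections import deque


def make_replay_memory(memory_size=200):
    # maxlen makes the deque drop its oldest transition automatically,
    # replacing the manual length check and pop(0) above.
    return deque(maxlen=memory_size)


def replay(memory, actor, critic, batch_size=10):
    # Sample a minibatch of stored transitions and run one actor-critic
    # update per transition, mirroring the loop in update() above.
    if len(memory) <= batch_size:
        return
    for state, action, reward, new_state in random.sample(list(memory), batch_size):
        td_error = critic.learn(state, reward, new_state)
        actor.learn(state, action, td_error)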
s = env.reset()
t = 0
track_r = []

while True:
    if RENDER:
        env.render()

    a = actor.choose_action(s)
    s_, r, done, info = env.step(a)
    if done:
        r = -20  # large penalty when the episode terminates early

    track_r.append(r)

    # Critic scores the transition; the actor is updated with the resulting TD error.
    td_error = critic.learn(s, r, s_)
    actor.learn(s, a, td_error)

    s = s_
    t += 1

    if done or t >= 1000:
        ep_rs_sum = sum(track_r)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = True
        print('episode:', i_episode, 'reward:', int(running_reward))
        break
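The running_reward bookkeeping above is an exponential moving average of episode returns with decay 0.95. A small self-contained helper showing the same smoothing (the function name is illustrative, not from the original code):

def update_running_reward(running_reward, ep_return, decay=0.95):
    # Keep 95% of the previous estimate and blend in 5% of the newest
    # episode return; seed with the raw return on the first episode.
    if running_reward is None:
        return ep_return
    return running_reward * decay + ep_return * (1.0 - decay)


# Example: a previous estimate of 100.0 and a new return of 60.0
# gives 100.0 * 0.95 + 60.0 * 0.05 == 98.0.
print(update_running_reward(100.0, 60.0))  # 98.0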
def train(self, max_episode=10, max_path_length=200, verbose=0):
    env = self.env
    avg_reward_sum = 0.

    for e in range(max_episode):
        observation = env._reset()
        game_over = False
        reward_sum = 0

        inputs = []
        outputs = []
        predicteds = []
        rewards = []

        # Per-episode log file, recreated from scratch each episode.
        f_episode = "episode_{0}.csv".format(e)
        os.system("rm -rf {0}".format(f_episode))

        print(observation[0].shape, observation[1].shape)

        # Note: the session and both networks are rebuilt every episode.
        sess = tf.Session()
        actor = Actor(
            sess,
            n_actions=self.env.action_space.n,
            # output_graph=True,
        )
        # we need a good teacher, so the teacher should learn faster than the actor
        critic = Critic(sess, n_actions=self.env.action_space.n)
        sess.run(tf.global_variables_initializer())

        while not game_over:
            action, aprob = actor.choose_action(observation)

            inputs.append(observation)
            predicteds.append(aprob)

            # One-hot encode the chosen action.
            y = np.zeros([self.env.action_space.n])
            y[action] = 1.
            outputs.append(y)

            observation_, reward, actual_reward, game_over, info = self.env._step(action)
            reward_sum += float(actual_reward)
            print(reward)
            rewards.append(float(reward))

            # After env.step
            td_error = critic.learn(
                observation, reward_sum, observation_)  # gradient = grad[r + gamma * V(s_) - V(s)]
            actor.learn(
                observation, action, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

            # Cap the history kept for the RNN model.
            if len(inputs) > self.max_memory:
                del inputs[0]
                del outputs[0]
                del predicteds[0]
                del rewards[0]

            if verbose > 0:
                if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
                    color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" %
                          (info["dt"], color + env.actions[action] + bcolors.ENDC,
                           reward_sum, info["cum"]) +
                          ("\t".join(["%s:%.2f" % (l, i)
                                      for l, i in zip(env.actions, aprob.tolist())])))
                    os.system("echo %s >> %s" %
                              ("%s:\t%s\t%.2f\t%.2f\t" %
                               (info["dt"], env.actions[action], reward_sum, info["cum"]) +
                               ("\t".join(["%s:%.2f" % (l, i)
                                           for l, i in zip(env.actions, aprob.tolist())])),
                               f_episode))

        # Episode finished: update the running average and log the result.
        avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
            e, info["code"],
            (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
            ("%.2f" % reward_sum) + bcolors.ENDC,
            info["cum"], avg_reward_sum)
        print(toPrint)
        if self.history_filename is not None:
            os.system("echo %s >> %s" % (toPrint, self.history_filename))

        # Re-group the stored observations block-wise for the model input.
        dim = len(inputs[0])
        inputs_ = [[] for i in range(dim)]
        for obs in inputs:
            for i, block in enumerate(obs):
                inputs_[i].append(block[0])
        inputs_ = [np.array(inputs_[i]) for i in range(dim)]

        outputs_ = np.vstack(outputs)
        predicteds_ = np.vstack(predicteds)
        rewards_ = np.vstack(rewards)

        print("shape: ", np.shape(rewards))
        print("fit model input.shape %s, output.shape %s" %
              ([inputs_[i].shape for i in range(len(inputs_))], outputs_.shape))

        np.set_printoptions(linewidth=200, suppress=True)
        print("currentTargetIndex:", env.currentTargetIndex)
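One thing worth noting in the loop above: tf.Session(), the Actor, and the Critic are all constructed inside the for e in range(max_episode) loop, so the networks are re-initialized at the start of every episode and nothing learned in one episode carries over to the next. If cross-episode learning is the intent, the construction can be hoisted above the loop. The sketch below shows that reordering; it keeps only the core update, reuses the Actor/Critic classes and env from the code above, and is an assumption about intent rather than the original implementation.

import tensorflow as tf  # TF 1.x API, as in the surrounding code

# Build the graph and session once, before the episode loop.
sess = tf.Session()
actor = Actor(sess, n_actions=env.action_space.n)
critic = Critic(sess, n_actions=env.action_space.n)
sess.run(tf.global_variables_initializer())

for e in range(max_episode):
    observation = env._reset()
    game_over = False
    reward_sum = 0.0
    while not game_over:
        action, aprob = actor.choose_action(observation)
        observation_, reward, actual_reward, game_over, info = env._step(action)
        reward_sum += float(actual_reward)
        # Same actor-critic update as above, now against a persistent graph.
        td_error = critic.learn(observation, reward_sum, observation_)
        actor.learn(observation, action, td_error)
        observation = observation_  # advance the state before the next step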
        # End of one trajectory: stack the per-step buffers into arrays.
        buffer_s, buffer_a, buffer_v = np.vstack(buffer_s), np.vstack(buffer_a), np.vstack(buffer_v)
        break

    # Collect the finished trajectory into the per-episode batch.
    ep_s.append(buffer_s)
    ep_a.append(buffer_a)
    ep_v.append(buffer_v)
    ep_r.extend(buffer_r)
    ep_w.extend(buffer_w)

print("================", "Train EP", i, "================")
ep_td, ep_c_loss, ep_a_loss = [], [], []
for j in range(pa.batch_num):
    # One critic/actor update per trajectory in the batch.
    td_error, critic_loss = critic.learn(ep_s[j], ep_v[j])
    actor_loss = actor.learn(ep_s[j], ep_a[j], td_error)
    ep_td.append(td_error)
    ep_c_loss.append(critic_loss)
    ep_a_loss.append(actor_loss)

ep_td = np.concatenate(ep_td)
ep_a = np.concatenate(ep_a)
ep_c_loss = np.array(ep_c_loss)
ep_a_loss = np.array(ep_a_loss)

# Count how often each action was taken.
unique, counts = np.unique(ep_a, return_counts=True)
dict_a = dict(zip(unique, counts))

print(
    "EP:", i, "\n",
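The per-episode action summary at the end relies on np.unique(..., return_counts=True) zipped into a dict. A standalone example of that idiom:

import numpy as np

# Six actions taken in an episode: action 0 twice, action 1 once, action 2 three times.
ep_a = np.array([0, 2, 1, 2, 2, 0])
unique, counts = np.unique(ep_a, return_counts=True)
dict_a = dict(zip(unique.tolist(), counts.tolist()))
print(dict_a)  # {0: 2, 1: 1, 2: 3}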