def _create(index):
    name = 'bullet-{0}-{1}'.format(self.index, index)
    pi = Policy(N_S, N_A, name)
    ac = ActorCritic(self.sess, pi, bullet_config.optimizer,
                     global_pi=bullet_config.global_pi,
                     entropy_beta=ENTROPY_BETA)
    head = BulletHead(self.env, ac,
                      update_step=UPDATE_GLOBAL_ITER,
                      gamma=GAMMA,
                      lambda_=LAMBDA)
    return head
def run(render=False):
    env = gym.make(GAME_NAME).unwrapped
    env.reset()
    N_S, N_A = env.observation_space.shape, 4  # env.action_space.n
    env.close()

    sess = tf.InteractiveSession()

    # optimizer = tf.train.RMSPropOptimizer(LR, name='RMSPropA')
    optimizer = tf.train.AdamOptimizer(LR)
    global_pi = Policy(N_S, N_A, 'global')

    # Create train workers
    workers = []
    for i in range(N_WORKERS):
        i_name = 'pi_%i' % i  # worker name
        env = gym.make(GAME_NAME).unwrapped
        pi = Policy(N_S, N_A, i_name)
        ac = ActorCritic(sess, pi, optimizer,
                         global_pi=global_pi,
                         entropy_beta=ENTROPY_BETA)
        worker = Worker(ac, env, GAMMA, LAMBDA)
        workers.append(worker)

    # init variables
    sess.run(tf.global_variables_initializer())

    # train workers
    worker_threads = []
    for i in range(len(workers)):
        worker = workers[i]
        # Bind the current worker as a default argument; a bare
        # `lambda: worker.train(...)` would late-bind `worker`, so every
        # thread could end up running the last worker in the list.
        if len(workers) > 1 and i == 0:
            job = lambda w=worker: w.test(render=render)
        else:
            job = lambda w=worker: w.train(update_nsteps=UPDATE_GLOBAL_ITER)
        t = threading.Thread(target=job)
        t.start()
        worker_threads.append(t)

    # wait for all worker threads to finish
    COORD = tf.train.Coordinator()
    COORD.join(worker_threads)
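# Worker.train() is not shown above; since each Worker receives both GAMMA and
# LAMBDA, a common choice for combining them is generalized advantage
# estimation (GAE). The helper below is a standalone sketch under that
# assumption; the function name and signature are not from the original code.
import numpy as np

def gae_advantages(rewards, values, last_value, dones, gamma, lam):
    """Generalized advantage estimation for one rollout.

    rewards, values, dones are per-step lists; last_value bootstraps the
    final state. Returns (advantages, returns) as numpy arrays.
    """
    values = np.append(np.asarray(values, dtype=np.float32), last_value)
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * values[t + 1] * non_terminal - values[t]
        gae = delta + gamma * lam * non_terminal * gae
        advantages[t] = gae
    returns = advantages + values[:-1]
    return advantages, returns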
db_dict = pickle.load(open(DICT_FILE_PATH, 'rb'), encoding='latin1')

# Load goal file
user_goals = pickle.load(open(USER_GOALS_FILE_PATH, 'rb'), encoding='latin1')

# Init. objects
if USE_USERSIM:
    user = UserSimulator(user_goals, constants, database)
else:
    user = User(constants)
emc = ErrorModelController(db_dict, constants)
state_tracker = StateTracker(database, constants)
# sarsa_agent = SARSAgent(state_tracker.get_state_size(), constants)
sess = K.get_session()
ac_agent = ActorCritic(state_tracker.get_state_size(), constants, sess)
# dqn_agent = DQNAgent(state_tracker.get_state_size(), constants)


def run_round(state, warmup=False):
    # 1) Agent takes action given state tracker's representation of dialogue (state)
    agent_action = ac_agent.act(state)
    # 2) Update state tracker with the agent's action
    state_tracker.update_state_agent(agent_action)
    # 3) User takes action given agent action
    user_action, reward, done, success = user.step(agent_action)
    if not done:
        # 4) Infuse error into semantic frame level of user action
        emc.infuse_error(user_action)
    # 5) Update state tracker with user action
    state_tracker.update_state_user(user_action)
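# UserSimulator and User are used interchangeably by run_round, so both need
# to expose the same step() contract. The abstract base class below is only a
# sketch of that shared interface; the class name and the use of abc are
# assumptions, not part of the original code.
from abc import ABC, abstractmethod

class DialogueUser(ABC):
    """Shared interface that run_round relies on."""

    @abstractmethod
    def step(self, agent_action):
        """Consume the agent's action and return a tuple of
        (user_action, reward, done, success)."""
        raise NotImplementedError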
class Agent:
    def __init__(self, input_size, output_size, hidden_size, lr, beta, gamma,
                 update_epoch, epsilon):
        self.input_size = input_size      # i.e. the state's shape
        self.output_size = output_size    # the action's shape
        self.hidden_size = hidden_size
        self.lr = lr                      # learning rate
        self.beta = beta                  # betas for the optimizer
        self.gamma = gamma                # discount factor
        self.update_epoch = update_epoch  # number of epochs per policy update
        self.epsilon = epsilon            # needed for the clipping term

        # create pi and pi_old
        self.policy = ActorCritic(input_size, output_size, hidden_size).to(device)
        self.old_policy = ActorCritic(input_size, output_size, hidden_size).to(device)

        # use the Adam optimizer to update the network weights
        self.optimizer = torch.optim.Adam(
            self.policy.parameters(),
            lr=lr,
            betas=beta
        )

        # copy the policy's weights into old_policy
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.loss = nn.MSELoss()

    # update the networks
    def update(self, memory):
        rewards = []
        discounted_reward = 0
        # Monte Carlo estimate of every state's return
        for reward, is_done in zip(reversed(memory.rewards), reversed(memory.is_done)):
            if is_done:
                discounted_reward = 0
            # accumulate the discounted return
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # list to tensor
        rewards = torch.tensor(rewards).to(device)
        # normalize
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # prepare to update policy
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        # update
        for _ in range(self.update_epoch):
            # critic evaluates data sampled by the old policy
            logprobs, states_value, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # begin to calculate J_ppo
            # working with log probabilities turns the ratio's division into a
            # subtraction; exponentiate afterwards to recover the ratio
            frac = torch.exp(logprobs - old_logprobs.detach())

            # calculate the advantage function
            advantages = rewards - states_value.detach()
            item_1 = frac * advantages
            # clipped term
            item_2 = torch.clamp(frac, 1 - self.epsilon, 1 + self.epsilon) * advantages

            # calculate the loss function
            loss = -torch.min(item_1, item_2) \
                + 0.5 * self.loss(states_value, rewards) \
                - 0.01 * dist_entropy

            # update
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # load the updated policy into old_policy
        self.old_policy.load_state_dict(self.policy.state_dict())
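# Agent.update(memory) only assumes a container with parallel lists named
# states, actions, logprobs, rewards, and is_done; that container is not shown
# here. The Memory class below is a minimal sketch of what it could look like
# (the class name and the clear() helper are assumptions, not original code).
class Memory:
    def __init__(self):
        self.states = []    # per-step state tensors, stacked later by Agent.update
        self.actions = []   # per-step action tensors
        self.logprobs = []  # log-probabilities under the policy that acted
        self.rewards = []   # python floats
        self.is_done = []   # episode-termination flags

    def clear(self):
        # Drop the rollout after the policy has been updated.
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_done[:]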
def main():
    env = gym.make('CartPole-v1')
    state_size = env.observation_space.shape[0]
    num_actions = env.action_space.n
    model = ActorCritic(num_actions)

    ep_rewards = 0.0
    best_reward_so_far = 0.0
    for epoch in range(N_epochs):
        s = env.reset()
        done = False
        states = []
        next_states = []
        actions = []
        rewards = []
        dones = []
        steps = 0
        while not done:
            prob, _ = model(s[None, :])
            action = np.random.choice(num_actions, p=np.squeeze(prob))
            action_prob = prob[0, action]
            s_prime, rwd, done, _ = env.step(action)
            ep_rewards += rwd

            states.append(s)
            actions.append(action)
            rewards.append(rwd)
            next_states.append(s_prime)
            dones.append(done)

            if steps == T_steps or done:
                with tf.GradientTape() as tape:
                    loss = model.compute_loss(states, [s_prime], actions, rewards, dones)
                gradient = tape.gradient(loss, model.trainable_variables)
                model.optimizer.apply_gradients(
                    zip(gradient, model.trainable_variables))
                states = []
                next_states = []
                actions = []
                rewards = []
                dones = []
                steps = 0

            s = s_prime
            steps += 1

        # if epoch > min_epochs and best_reward_so_far < ep_rewards:
        #     best_reward_so_far = ep_rewards
        #     model.save('ac_model')
        #     print("model saved")

        if (epoch + 1) % 20 == 0:
            print("Epoch [%d/%d] : Reward %d" % (epoch + 1, N_epochs, ep_rewards / 20))
            ep_rewards = 0.0
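# model.compute_loss is not shown above; the call site passes
# (states, [last next state], actions, rewards, dones), which suggests n-step
# returns bootstrapped from the value of the last observed state. The function
# below is a sketch of such a loss for a model returning (action_probs, value);
# its name and every detail of it are assumptions, not the original method.
import numpy as np
import tensorflow as tf

def actor_critic_loss(model, states, last_next_state, actions, rewards, dones,
                      gamma=0.99, value_coef=0.5, entropy_coef=0.01):
    states = tf.convert_to_tensor(np.stack(states), dtype=tf.float32)
    probs, values = model(states)
    values = tf.squeeze(values, axis=-1)

    # Bootstrap from the last next state unless the rollout ended in a terminal step.
    if dones[-1]:
        ret = 0.0
    else:
        _, last_value = model(np.asarray(last_next_state, dtype=np.float32))
        ret = float(tf.squeeze(last_value))

    # n-step discounted returns, computed backwards through the rollout.
    returns = []
    for r, d in zip(reversed(rewards), reversed(dones)):
        if d:
            ret = 0.0
        ret = r + gamma * ret
        returns.insert(0, ret)
    returns = tf.convert_to_tensor(returns, dtype=tf.float32)

    advantages = returns - values
    action_mask = tf.one_hot(actions, probs.shape[-1], dtype=tf.float32)
    log_probs = tf.math.log(tf.reduce_sum(probs * action_mask, axis=1) + 1e-8)

    policy_loss = -tf.reduce_mean(log_probs * tf.stop_gradient(advantages))
    value_loss = tf.reduce_mean(tf.square(advantages))
    entropy = -tf.reduce_mean(tf.reduce_sum(probs * tf.math.log(probs + 1e-8), axis=1))
    return policy_loss + value_coef * value_loss - entropy_coef * entropy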
def execute(self):
    sess = tf.Session()
    K.set_session(sess)
    env = Gazeboworld()
    actor_critic = ActorCritic(env)

    t = time.time()
    value = datetime.fromtimestamp(t)
    t_str = value.strftime('%m_%d_%H_%M')
    dir_p = t_str + '_weights'
    dir_a = t_str + '_weights/actor'
    dir_c = t_str + '_weights/critic'
    try:
        os.makedirs(dir_p)
        os.makedirs(dir_a)
        os.makedirs(dir_c)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    if len(self.args) > 2:
        actor_critic.load_trained_model(self.args[1], self.args[2])
        # sys.exit(0)

    def saveWeights(actor_critic, episode, dir_a, dir_c):
        actor_critic.target_actor_model.save_weights(
            dir_a + '/w_E{}.h5'.format(episode))
        actor_critic.target_critic_model.save_weights(
            dir_c + '/w_E{}.h5'.format(episode))
        print 'Weights Saved For Episode: {}'.format(episode)

    num_trials = 10000
    trial_len = 5000

    for i in xrange(num_trials):
        cur_state = env.reset()
        done = False
        while not done:
            cur_state = cur_state.reshape(
                (1, env.observation_space.shape[0]))
            action = actor_critic.act(cur_state)
            action = action.reshape((1, env.action_space.shape[0]))

            # new_state, reward, done = env.step([[1,1]])
            try:
                new_state, reward, done = env.step(action)
            except Exception as e:
                print e
                continue
            new_state = new_state.reshape(
                (1, env.observation_space.shape[0]))

            actor_critic.remember(cur_state, action, reward, new_state, done)
            actor_critic.train()

            # print env.isDead(), done, min(env.laser.getLaserData().distanceData[:360])
            if done:
                env.reset()
                print "Done....<<<<<<<<<<<<<<<<<<... ", env.death
                # cur_state = np.array(env.state)
            # advance to the next state before the following step
            cur_state = new_state
            sys.stdout.flush()

        print "Length of memory: {}".format(len(actor_critic.memory))
        actor_critic.epsilon *= actor_critic.epsilon_decay
        print "EPSILON =================================> {}".format(
            actor_critic.epsilon)
        saveWeights(actor_critic, i, dir_a, dir_c)
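# execute() decays actor_critic.epsilon every trial, which implies an
# epsilon-based exploration scheme inside act(). The helper below is only a
# sketch of the usual pattern for a continuous action space; the function name,
# the actor_model attribute, and a gym-style action_space.sample() on the
# custom Gazeboworld env are all assumptions, not the original implementation.
import numpy as np

def epsilon_greedy_act(actor_critic, env, cur_state):
    # Explore with probability epsilon, otherwise follow the actor network.
    if np.random.random() < actor_critic.epsilon:
        return np.asarray(env.action_space.sample()).reshape(1, -1)
    return actor_critic.actor_model.predict(cur_state)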