import numpy as np
import torch
import torch.nn.functional as F


def train_actor(actor_model, critic_model, noisy_actor_model, state_transitions,
                num_actor_training_samples, num_actions):
    # For each state, have the critic score a pool of candidate actions,
    # keep the action with the highest Q value, and train the actor to
    # converge towards those (state, best action) pairs.

    # Generate random candidate actions in [-1, 1]
    random_actions = []
    for _ in range(num_actor_training_samples):
        random_actions.append(np.random.rand(num_actions) * 2 - 1)

    # Also add the actions actually taken in the sampled transitions
    for transition in state_transitions:
        random_actions.append(transition.action)

    random_states = [s.state for s in state_transitions]

    # For each state, add the actor's and the noisy actor's predicted
    # actions to the candidate pool
    for state in random_states:
        with torch.no_grad():
            act = actor_model(torch.Tensor(state).to(actor_model.device)).cpu().numpy()
            random_actions.append(act)
            act = noisy_actor_model(torch.Tensor(state).to(noisy_actor_model.device)).cpu().numpy()
            random_actions.append(act)

    best_state_action = []
    for i_state in range(len(random_states)):
        # Get the critic's Q value for every candidate action in this state
        QAs = []
        for i_action in range(len(random_actions)):
            with torch.no_grad():
                qval = critic_model(
                    torch.cat(
                        (torch.Tensor(random_states[i_state]),
                         torch.Tensor(random_actions[i_action])), 0
                    ).to(critic_model.device)
                ).cpu()
            QAs.append(qval)
        # Keep the highest-Q action (random, replayed, or actor-predicted)
        best_state_action.append(
            sars(random_states[i_state], random_actions[np.argmax(QAs)],
                 0.0, None, False, np.max(QAs))
        )

    t_random_states = torch.stack([torch.Tensor(s.state) for s in best_state_action]).to(actor_model.device)
    target_actions = torch.stack([torch.Tensor(s.action) for s in best_state_action]).to(actor_model.device)

    # Regress the actor's predicted actions towards the best-scoring actions
    actor_model.zero_grad()
    predicted_actions = actor_model(t_random_states)
    loss = F.smooth_l1_loss(predicted_actions, target_actions)
    loss.backward()
    actor_model.opt.step()
    return loss
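
# NOTE: train_actor assumes a few pieces defined elsewhere in the project: a
# `sars` record with (state, action, reward, next_state, done, qval) fields,
# and actor/critic modules that carry their own `.device` and `.opt`. The
# sketch below is a hypothetical minimal version of that scaffolding, inferred
# from how the names are used above -- not the project's actual definitions.
from collections import namedtuple

from torch import nn

sars = namedtuple('sars', ('state', 'action', 'reward', 'next_state', 'done', 'qval'))


class Actor(nn.Module):
    def __init__(self, obs_dim, num_actions, device='cpu'):
        super().__init__()
        self.device = device  # read by train_actor when moving tensors
        self.net = nn.Sequential(
            nn.Linear(obs_dim, 256), nn.ReLU(),
            nn.Linear(256, num_actions), nn.Tanh(),  # squash actions into [-1, 1]
        )
        self.opt = torch.optim.Adam(self.parameters(), lr=1e-4)  # stepped by train_actor
        self.to(device)

    def forward(self, x):
        return self.net(x)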
if step_counter < INITIAL_RANDOM_STEPS or random() < epsilon or game % RANDOM_GAME_EVERY == 0:
    # Exploration: take a uniformly random action
    action = env.action_space.sample()
elif step_counter >= INITIAL_RANDOM_STEPS and game % NOISY_AGENT_GAME_EVERY == 0:
    # Periodically let the noisy actor drive the episode
    if step % 100 == 0:
        print("noisy agent acting")
    action = noisy_agent.get_actions(observation).cpu().detach().numpy()
else:
    # Otherwise act with the main actor
    action = agent.get_actions(observation).cpu().detach().numpy()

observation_next, reward, done, info = env.step(action)
if step >= MAX_EPISODE_STEPS:
    done = True

_sars = sars(observation, action, reward, observation_next, done, 0.0)
episode_sars.append(_sars)
avg_reward.append([reward])
score += reward

# Train the critic on sampled transitions every N environment steps
if rb.index > INITIAL_RANDOM_STEPS and step_counter % TRAIN_CRITIC_EVERY_N_STEP == 0:
    for s in range(CRITIC_TRAINING_ITTERATIONS):
        samples = rb.sample(CRITIC_TRAINING_SAMPLE_SIZE, step)
        train_critic(agent.critic_model, samples, env.action_space.shape[0])

# Train the actor on its own schedule
if rb.index > INITIAL_RANDOM_STEPS and step_counter % TRAIN_ACTOR_EVERY_N_STEP == 0:
    for s in range(ACTOR_TRAINING_ITTERTIONS):
        samples = rb.sample(ACTOR_TRAINING_SAMPLE_SIZE, 0)
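
# The loop above leans on a replay buffer `rb` exposing `.index` (a count of
# stored transitions) and `.sample(n, step)`. A minimal ring-buffer sketch of
# that interface follows; it is an assumption inferred from the calls above,
# and it ignores sample()'s second argument, which the real buffer may use
# (e.g. to bias sampling towards recent steps).
class ReplayBuffer:
    def __init__(self, capacity=100_000):
        self.capacity = capacity
        self.buffer = [None] * capacity
        self.index = 0  # total insertions so far; writes wrap via modulo

    def insert(self, transition):
        self.buffer[self.index % self.capacity] = transition
        self.index += 1

    def sample(self, num_samples, step=0):
        # Uniform sampling without replacement from the filled portion
        filled = min(self.index, self.capacity)
        idxs = np.random.choice(filled, size=min(num_samples, filled), replace=False)
        return [self.buffer[i] for i in idxs]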