Example 1
import numpy as np
import torch
import torch.nn.functional as F


def train_actor(actor_model, critic_model, noisy_actor_model, state_transitions, num_actor_training_samples, num_actions):
    # For each state, ask the critic to score a pool of candidate actions,
    # keep the (state, action) pair with the highest Q value,
    # and train the actor to regress onto that set of best actions.
    # (The sars container used below is sketched after this function.)

    # Generate uniformly random candidate actions in [-1, 1]
    random_actions = []
    for i in range(num_actor_training_samples):
        random_actions.append(np.random.rand(num_actions) * 2 - 1)
    # Add the actions actually taken in the sampled transitions to the candidate pool
    for s in state_transitions:
        random_actions.append(s.action)
    random_states = [s.state for s in state_transitions]


    # For each state, also add the current actor's and the noisy actor's
    # predicted actions to the candidate pool
    for state in random_states:
        with torch.no_grad():
            act = actor_model(torch.Tensor(state).to(actor_model.device)).cpu().numpy()
            random_actions.append(act)
            act = noisy_actor_model(torch.Tensor(state).to(noisy_actor_model.device)).cpu().numpy()
            random_actions.append(act)

    # Score every candidate action with the critic and keep, for each state,
    # the action with the highest Q value
    best_state_action = []
    for i_states in range(len(random_states)):
        QAs = []
        for i_actions in range(len(random_actions)):
            with torch.no_grad():
                state_action = torch.cat((torch.Tensor(random_states[i_states]), torch.Tensor(random_actions[i_actions])), 0)
                qval = critic_model(state_action.to(critic_model.device)).cpu()
                QAs.append(qval)
        # Keep the best candidate as a sars tuple (reward and next state are unused here)
        best_state_action.append(sars(random_states[i_states], random_actions[np.argmax(QAs)], 0.0, None, False, np.max(QAs)))

    # Regress the actor onto the best (state, action) pairs found above
    t_random_states = torch.stack([torch.Tensor(s.state) for s in best_state_action]).to(actor_model.device)
    target_actions = torch.stack([torch.Tensor(s.action) for s in best_state_action]).to(actor_model.device)
    actor_model.zero_grad()
    predicted_actions = actor_model(t_random_states)

    loss = F.smooth_l1_loss(predicted_actions, target_actions)
    loss.backward()
    actor_model.opt.step()
    return loss
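
train_actor packs each selected (state, action) pair into a sars container whose definition is not part of the excerpt; it also expects the actor and critic models to expose a .device attribute and the actor an .opt optimizer, as used above. Judging from the field order in both examples, sars is presumably a namedtuple along the following lines (the field names are assumptions):

from collections import namedtuple

# Assumed layout of the sars container: the last field holds the Q value
# assigned by the critic in train_actor (0.0 when unknown).
sars = namedtuple("sars", ("state", "action", "reward", "state_next", "done", "qval"))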
Example 2
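            # Action selection: a uniformly random action during warm-up, with
            # probability epsilon, and on every RANDOM_GAME_EVERY-th game; the
            # noisy actor on every NOISY_AGENT_GAME_EVERY-th game after warm-up;
            # otherwise the main actor.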
            if step_counter < INITIAL_RANDOM_STEPS or random() < epsilon or game % RANDOM_GAME_EVERY == 0:
                action = env.action_space.sample()
            elif step_counter >= INITIAL_RANDOM_STEPS and game % NOISY_AGENT_GAME_EVERY == 0:
                if step % 100 == 0:
                    print("noisy agent acting")
                action = noisy_agent.get_actions(observation).cpu().detach().numpy()
            else:
                action = agent.get_actions(observation).cpu().detach().numpy()

            observation_next, reward, done, info = env.step(action)
            if step >= MAX_EPISODE_STEPS:
                done = True
            # Record the transition for this episode
            _sars = sars(observation, action, reward, observation_next, done, 0.0)
            episode_sars.append(_sars)
            avg_reward.append([reward])
            score += reward
            # Train the critic on minibatches sampled from the replay buffer
            if rb.index > INITIAL_RANDOM_STEPS and step_counter % TRAIN_CRITIC_EVERY_N_STEP == 0:
                for s in range(CRITIC_TRAINING_ITTERATIONS):
                    samples = rb.sample(CRITIC_TRAINING_SAMPLE_SIZE, step)
                    train_critic(agent.critic_model, samples, env.action_space.shape[0])

            # Train the actor on its own schedule
            if rb.index > INITIAL_RANDOM_STEPS and step_counter % TRAIN_ACTOR_EVERY_N_STEP == 0:
                for s in range(ACTOR_TRAINING_ITTERTIONS):
                    samples = rb.sample(ACTOR_TRAINING_SAMPLE_SIZE, 0)
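
The loop relies on a replay buffer rb that exposes an index counter and a sample(num_samples, step) method; the buffer itself is not shown in the excerpt. A minimal sketch under those assumptions (uniform sampling, all names hypothetical):

import random as rnd

class ReplayBuffer:
    # Minimal stand-in for the rb object used above; the real implementation
    # may differ (e.g. prioritised sampling based on the step argument).
    def __init__(self, buffer_size=100000):
        self.buffer_size = buffer_size
        self.buffer = []
        self.index = 0  # total number of transitions inserted so far

    def insert(self, transition):
        if len(self.buffer) >= self.buffer_size:
            self.buffer.pop(0)  # drop the oldest transition
        self.buffer.append(transition)
        self.index += 1

    def sample(self, num_samples, step):
        # step is accepted only to match the call sites above; sampling is uniform here
        return rnd.sample(self.buffer, min(num_samples, len(self.buffer)))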