Example #1
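A training entry point: it builds a shared global actor and critic, puts their parameters in shared memory, and spawns one worker process per socket connection, each running Agent.Brain.train. The imports shown at the top are the ones the snippet appears to rely on; NeuralNet, ClientSocket, and Agent are project-local modules.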
import torch
import torch.multiprocessing as mp

# Project-local modules this snippet relies on: the network definitions,
# the socket client wrapper, and the worker class.
import NeuralNet
import ClientSocket
import Agent


def main():
    # A CUDA check is left in, but the snippet then forces CPU for the
    # shared global networks.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = torch.device('cpu')
    num_action = 2
    num_state = 4
    num_process = 5

    global_Actor = NeuralNet.ActorNet(inputs=num_state,
                                      outputs=num_action,
                                      num_hidden_layers=2,
                                      hidden_dim=8).to(device)
    #summary(global_Actor, input_size=(10,num_state))
    global_Critic = NeuralNet.CriticNet(inputs=num_state,
                                        outputs=1,
                                        num_hidden_layers=2,
                                        hidden_dim=8).to(device)
    #summary(global_Critic, input_size=(10,num_state))
    batch_size = 64
    GAMMA = 0.95
    max_episodes = 5000
    max_step = 1000
    global_Actor.share_memory()
    global_Critic.share_memory()

    processes = []
    processes_socket = []
    processes_agent = []
    mp.set_start_method('spawn')
    print("MP start method:", mp.get_start_method())

    ip = '110.76.78.109'  # defined but not passed to MySocket below
    port = 1111
    for rank in range(num_process):
        # One socket connection and one worker agent per process.
        processes_socket.append(ClientSocket.MySocket(port, 'f', 'ffff?f'))
        processes_agent.append(Agent.Brain(GlobalActorNet=global_Actor,
                                           GlobalCriticNet=global_Critic,
                                           device=device,
                                           socket=processes_socket[rank],
                                           num_action=num_action,
                                           max_episodes=max_episodes,
                                           max_step=max_step,
                                           batch_size=batch_size,
                                           GAMMA=GAMMA))
        p = mp.Process(target=processes_agent[rank].train, args=())
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
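
The snippet defines main() but does not show the call site; with the 'spawn' start method the module is re-imported in every child process, so a guarded entry point along these lines is presumably needed:

if __name__ == '__main__':
    main()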
Example #2
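The matching evaluation entry point: it rebuilds the global actor and critic, restores previously saved state dicts from disk, and spawns a single worker running Agent.Brain.test with the restored networks.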
# Imports are the same as in Example #1.
def main():

    processes = []
    processes_socket = []
    processes_agent = []

    device = torch.device('cpu')
    num_action = 2
    num_state = 4
    num_process = 1

    batch_size = 64
    GAMMA = 0.95
    max_episodes = 5000
    max_step = 1000

    global_Actor = NeuralNet.ActorNet(inputs=num_state,
                                      outputs=num_action,
                                      num_hidden_layers=2,
                                      hidden_dim=8).to(device)
    global_Critic = NeuralNet.CriticNet(inputs=num_state,
                                        outputs=1,
                                        num_hidden_layers=2,
                                        hidden_dim=8).to(device)

    # Restore previously trained weights (assumed to have been saved as
    # state dicts with torch.save).
    global_Actor.load_state_dict(torch.load("D:/modelDict/actor/modelDict.pt"))
    global_Critic.load_state_dict(
        torch.load("D:/modelDict/critic/modelDict.pt"))

    port = 1111
    for rank in range(num_process):
        processes_socket.append(ClientSocket.MySocket(port, 'f', 'ffff?f'))
        processes_agent.append(Agent.Brain(GlobalActorNet=global_Actor,
                                           GlobalCriticNet=global_Critic,
                                           device=device,
                                           socket=processes_socket[rank],
                                           num_action=num_action,
                                           max_episodes=max_episodes,
                                           max_step=max_step,
                                           batch_size=batch_size,
                                           GAMMA=GAMMA))
        # Evaluation worker: runs Agent.Brain.test with the restored networks.
        p = mp.Process(target=processes_agent[rank].test,
                       args=(global_Actor, global_Critic))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()
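
The load_state_dict calls above assume the weights were saved as state dicts during training. That save step is not shown in these snippets; a minimal sketch under that assumption (the function name and the point at which it would be called are hypothetical, the paths are the ones used above):

import torch

def save_checkpoint(actor, critic,
                    actor_path="D:/modelDict/actor/modelDict.pt",
                    critic_path="D:/modelDict/critic/modelDict.pt"):
    # Persist only the state dicts so they can be restored later with
    # load_state_dict(), as in Example #2.
    torch.save(actor.state_dict(), actor_path)
    torch.save(critic.state_dict(), critic_path)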
Example #3
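An A3C-style worker loop: at the start of each episode it copies the global weights into local networks, samples actions from the local actor, exchanges actions and observations with the environment over the socket, and once a batch is full it updates the critic against a one-step TD target and the actor with an advantage-weighted policy gradient plus an entropy bonus, then copies the updated weights back into the global networks.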
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Project-local modules this snippet relies on.
import NeuralNet
import Advantage


def train(global_actor, global_critic, num_state, num_action, device, socket,
          max_episodes, max_step, batch_size, GAMMA):
    # The local copies must match the architecture of the global networks
    # for the load_state_dict() calls below to succeed.
    local_actor = NeuralNet.ActorNet(inputs=num_state,
                                     outputs=num_action,
                                     num_hidden_layers=3,
                                     hidden_dim=32).to(device)
    local_critic = NeuralNet.CriticNet(inputs=num_state,
                                       outputs=1,
                                       num_hidden_layers=3,
                                       hidden_dim=32).to(device)
    """
    print("local actor")
    for param_tensor in local_actor.state_dict():
        print(param_tensor, "\t", local_actor.state_dict()[param_tensor].size())
    print("global actor")
    for param_tensor in global_actor.state_dict():
        print(param_tensor, "\t", global_actor.state_dict()[param_tensor].size())
    """
    local_actor.load_state_dict(global_actor.state_dict())
    local_critic.load_state_dict(global_critic.state_dict())

    entropy_coef = 0.001

    actor_optimizer = optim.Adam(local_actor.parameters())
    critic_optimizer = optim.Adam(local_critic.parameters())
    memory = Advantage.AdvantageMemory(batch_size,
                                       num_state,
                                       device=device,
                                       GAMMA=GAMMA,
                                       kind_action=1)

    for epi in range(max_episodes):
        # Start each episode from the latest global weights.
        local_actor.load_state_dict(global_actor.state_dict())
        local_critic.load_state_dict(global_critic.state_dict())
        state, reward, done, height = socket.getdata()
        local_actor.eval()
        local_critic.eval()

        for step in range(max_step):
            # Networks stay in eval mode while sampling actions.
            local_actor.eval()
            local_critic.eval()
            action_prob = local_actor(
                torch.from_numpy(state).float().to(device))
            action_distrib = Categorical(action_prob)
            action = action_distrib.sample()

            # Send the chosen action to the environment and receive the
            # next observation over the socket.
            socket.senddata(float(action.item()))
            next_state, reward, done, height = socket.getdata()

            if done:
                # Terminal step: penalize and mask out the bootstrap term.
                reward = -10
                mask = 0
            else:
                reward = 0
                #reward = (height - 3)/10
                mask = 1

            memory.input_data(state, next_state, reward, mask)
            state = next_state

            if memory.fill_batch():
                # Re-evaluate the policy on the stored batch and take the
                # probability of a freshly sampled action per transition.
                action_prob = local_actor(memory.states).float().to(device)
                action_distrib = Categorical(action_prob)
                action = action_distrib.sample()
                action = action.unsqueeze(1)
                action_prob = action_prob.gather(1, action)

                state_value = local_critic(memory.states)
                next_state_value = local_critic(memory.next_states)

                # One-step TD target and advantage.
                Q = memory.rewards + GAMMA * next_state_value.detach() * memory.masks
                A = Q - state_value

                local_actor.train()
                local_critic.train()

                # Critic update: smooth L1 loss against the TD target.
                critic_optimizer.zero_grad()
                critic_loss = F.smooth_l1_loss(state_value, Q)
                critic_loss.backward()
                critic_optimizer.step()
                global_critic.load_state_dict(local_critic.state_dict())

                # Actor update: advantage-weighted policy gradient with an
                # entropy bonus.
                log_prob = torch.log(action_prob + 1e-3)
                entropy = -log_prob * action_prob
                actor_loss = -A.detach() * log_prob - entropy_coef * entropy
                actor_loss = torch.sum(actor_loss) / len(action_prob)

                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()
                global_actor.load_state_dict(local_actor.state_dict())

            if done:
                print('epi :', epi, 'ended at step', step)
                break
Example #4
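A variant of the worker loop from Example #3: smaller local networks (2 hidden layers of width 8), reward shaping based on the reported height instead of a zero step reward, an MSE critic loss instead of smooth L1, and a per-step (rather than per-episode) sync with the global networks.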
# Imports are the same as in Example #3.
def train(global_actor, global_critic, num_state, num_action, device, socket,
          max_episodes, max_step, batch_size, GAMMA):
    local_actor = NeuralNet.ActorNet(inputs=num_state,
                                     outputs=num_action,
                                     num_hidden_layers=2,
                                     hidden_dim=8).to(device)
    local_critic = NeuralNet.CriticNet(inputs=num_state,
                                       outputs=1,
                                       num_hidden_layers=2,
                                       hidden_dim=8).to(device)
    """
    print("local actor")
    for param_tensor in local_actor.state_dict():
        print(param_tensor, "\t", local_actor.state_dict()[param_tensor].size())
    print("global actor")
    for param_tensor in global_actor.state_dict():
        print(param_tensor, "\t", global_actor.state_dict()[param_tensor].size())
    """
    local_actor.load_state_dict(global_actor.state_dict())
    local_critic.load_state_dict(global_critic.state_dict())

    entropy_coef = 0.001

    # Optimizers act on the local copies; the updated weights are copied
    # back into the global networks after each optimization step below.
    actor_optimizer = optim.Adam(local_actor.parameters())
    critic_optimizer = optim.Adam(local_critic.parameters())
    memory = Advantage.AdvantageMemory(batch_size,
                                       num_state,
                                       device=device,
                                       GAMMA=GAMMA,
                                       kind_action=1)

    for epi in range(max_episodes):
        state,reward,done,height = socket.getdata()
        local_actor.eval()
        local_critic.eval()

        for step in range(max_step):
            # Sync the local copies with the global networks every step.
            local_actor.load_state_dict(global_actor.state_dict())
            local_critic.load_state_dict(global_critic.state_dict())
            local_actor.eval()
            local_critic.eval()

            action_prob = local_actor(torch.from_numpy(state).float().to(device))
            action_distrib = Categorical(action_prob)
            action = action_distrib.sample()

            #print("start send data")
            socket.senddata(float(action.item()))
            next_state, reward, done, height = socket.getdata()

            if done is True :
                reward = -10
            else :
                reward = (height - 3)/10
            #print(epi," ",step," ",done)
            mask = 0 if done is True else 1

            #print(action_prob[action],state, next_state, reward , mask)
            memory.input_data(state,next_state,reward,mask)

            state = next_state

            if memory.fill_batch():
                # Probability of a freshly sampled action for each stored state.
                action_prob = local_actor(memory.states).float().to(device)
                action_distrib = Categorical(action_prob)
                action = action_distrib.sample()
                action_prob = action_prob.gather(1, action.unsqueeze(1))

                state_value = local_critic(memory.states)
                next_state_value = local_critic(memory.next_states)
                # One-step TD target and advantage.
                Q = memory.rewards + GAMMA * next_state_value.detach() * memory.masks
                A = Q - state_value

                local_actor.train()
                local_critic.train()

                # Critic update: MSE against the detached TD target.
                critic_optimizer.zero_grad()
                critic_loss = F.mse_loss(state_value, Q.detach())
                critic_loss.backward()
                critic_optimizer.step()
                global_critic.load_state_dict(local_critic.state_dict())

                # Actor update: advantage-weighted policy gradient with an
                # entropy bonus.
                log_prob = torch.log(action_prob)
                entropy = -(log_prob * action_prob)
                actor_loss = -A.detach() * log_prob - entropy_coef * entropy
                actor_loss = torch.sum(actor_loss) / len(action_prob)

                actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_optimizer.step()
                global_actor.load_state_dict(local_actor.state_dict())

            if done:
                print('epi :', epi, 'ended at step', step)
                break