def main():
    # agent = PGagent(agentParam)
    n_agents = 2
    # multiPG = independentAgent([PGagent(agentParam) for i in range(n_agents)])
    multiPG = Agents([Centralised_AC(8, 400) for i in range(n_agents)], 400)  # per-agent actor-critics as well as a social agent (action_dim=8, state_dim=400)
    # multiPG = Social_Agents([social_IAC(8,400,agentParam) for i in range(n_agents)],agentParam)
    for i_episode in range(101):
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(1, 500):
            actions = multiPG.choose_action(n_state)  # select an action for every agent
            # actions = multiPG.select_masked_actions(n_state)
            n_state_, n_reward, _, _ = env.step(actions)  # interact with the env
            if args.render:  # render or not
                env.render()
            # multiPG.push_reward(n_reward)  # each agent receive their own reward, the law receive the summed reward
            ep_reward += sum(n_reward)  # record the total reward
            multiPG.update(n_state, n_reward, n_state_, actions)
            # multiPG.update_law()
            n_state = n_state_

        running_reward = ep_reward
        # loss = multiPG.update_agents()  # update the policy for each PGagent
        # multiPG.update_law()  # update the policy of law
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            logger.scalar_summary("ep_reward", ep_reward, i_episode)
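
Each of the main() variants on this page relies on module-level names (env, args, logger, the dimensions and training hyperparameters) that are defined elsewhere in the repository. Below is a minimal sketch of that setup; the values and the make_env() constructor are placeholders, not the repository's actual definitions.

import argparse
from torch.utils.tensorboard import SummaryWriter

parser = argparse.ArgumentParser()
parser.add_argument('--render', action='store_true')
parser.add_argument('--log_interval', type=int, default=10)
args = parser.parse_args()

n_agents = 2
action_dim, state_dim = 8, 400      # action count and flattened observation size used above
n_episode, n_steps = 1000, 500      # training horizon (placeholder values)
line, save_eps = 50, 100            # phase length / checkpoint interval (placeholders)
ifsave_model, file_name = True, 'checkpoint'
# env = make_env()                  # construct the repository's multi-agent grid-world here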
Example #2
def main():
    writer = SummaryWriter('runs/' + envfolder + model_name)
    multiPGCen = CenAgents([
        Centralised_AC(action_dim,
                       state_dim,
                       add_para(i),
                       useLaw=True,
                       useCenCritc=False,
                       num_agent=n_agents) for i in range(n_agents)
    ], state_dim, agentParam)  # create PGagents as well as a social agent
    ## useCenCritc: use a centralised critic for the ordinary (non-law) agents
    multiPG = Agents([
        IAC(action_dim,
            state_dim,
            add_para(i),
            useLaw=True,
            useCenCritc=True,
            num_agent=n_agents) for i in range(n_agents)
    ])  # create PGagents as well as a social agent

    for i_episode in range(n_episode):
        n_state, ep_reward = env.reset(), 0  # reset the env
        #print(" lr .... ",multiPG.agents[0].optimizerA.param_groups[0]['lr'])
        for t in range(n_steps):
            if int(i_episode / line) % 2 == 0:  # alternate which group trains every 'line' episodes
                ## pis: detached action probabilities only (no gradient flows back)
                pis = multiPG.choose_indi_probs(n_state)
                actions = multiPGCen.choose_masked_actions(n_state, pis)
            else:
                mask_probs = multiPGCen.choose_indi_probs(n_state)
                actions = multiPG.choose_masked_actions(
                    n_state,
                    mask_probs)  #select masked actions for every agent
            n_state_, n_reward, _, _ = env.step(
                actions)  # interact with the env
            if args.render and i_episode % 30 == 0 and i_episode > 0:  # render or not
                env.render()
            ep_reward += sum(n_reward)  # record the total reward
            if int(i_episode / line) % 2 == 0:
                multiPGCen.update_share(n_state, n_reward, n_state_, actions)
            else:
                ## update_cent: update the ordinary agents against the centralised critic
                multiPG.update_cent(n_state, n_reward, n_state_, actions)
                ## multiPG.update(n_state, n_reward, n_state_, actions)
            n_state = n_state_

        running_reward = ep_reward

        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                  format(i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 11 and ifsave_model:
            multiPG.save(file_name)
            multiPGCen.save(file_name)
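
The int(i_episode / line) % 2 test above implements an alternating curriculum: the centralised 'law' agents (multiPGCen) and the ordinary agents (multiPG) take turns acting and updating, swapping roles every 'line' episodes, while the idle group only supplies detached action probabilities as a mask. A stripped-down sketch of that schedule, with an illustrative value for line:

line = 50                                  # assumed phase length in episodes
for i_episode in range(4 * line):
    law_phase = (i_episode // line) % 2 == 0
    if law_phase:
        pass  # multiPGCen acts and calls update_share; multiPG only provides detached probs
    else:
        pass  # multiPG acts and calls update_cent; multiPGCen only provides detached probs

Example #3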
def main():
    # agent = PGagent(agentParam)
    writer = SummaryWriter('runs/iac_' + model_name)
    n_agents = 2
    state_dim = 400
    action_dim = 8
    # multiPG = independentAgent([PGagent(agentParam) for i in range(n_agents)])
    multiPGCen = CenAgents([
        Centralised_AC(action_dim, state_dim, add_para(i), useLaw=True)
        for i in range(n_agents)
    ], state_dim, agentParam)  # create PGagents as well as a social agent
    #multiPG = Law_agent(action_dim,state_dim,agentParam,n_agents)
    multiPG = Agents([
        IAC(action_dim, state_dim, add_para(i), useLaw=True) for i in range(n_agents)
    ])  # create PGagents as well as a social agent
    #multiPG = Social_Agents([social_IAC(8,400,agentParam) for i in range(n_agents)],agentParam)
    for i_episode in range(n_episode):
        #print(" =====================  ")
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(n_steps):
            #print(" =====================  ",n_state)
            if int(i_episode / line) % 2 == 1:  # alternate which group trains every 'line' episodes
                pis = multiPG.choose_indi_probs(n_state)
                actions = multiPGCen.choose_masked_actions(n_state, pis)
            else:
                mask_probs = multiPGCen.choose_mask_probs(n_state)
                actions = multiPG.choose_masked_actions(
                    n_state, mask_probs)  # select masked actions for every agent
            # actions = multiPG.select_masked_actions(n_state)
            n_state_, n_reward, _, _ = env.step(
                actions)  # interact with the env
            if args.render and i_episode % 50 == 0:  # render or not
                env.render()
            # multiPG.push_reward(n_reward)  # each agent receive their own reward, the law receive the summed reward
            ep_reward += sum(n_reward)  # record the total reward
            if int(i_episode / line) % 2 == 1:
                #multiPGCen.update(n_state, n_reward, n_state_, actions)
                multiPGCen.update_share(n_state, n_reward, n_state_, actions)
            else:
                multiPG.update(n_state, n_reward, n_state_, actions)
            # multiPG.update_law()
            n_state = n_state_

        running_reward = ep_reward
        # loss = multiPG.update_agents()  # update the policy for each PGagent
        # multiPG.update_law()  # update the policy of law
        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                  format(i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 11 and ifsave_model:
            multiPG.save(file_name)
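
choose_masked_actions combines an agent's own policy with the probabilities handed over from the other group. The repository's exact combination lives inside the agent classes; a plausible reading, shown here only as an assumption, is an element-wise product of the two distributions followed by renormalisation and sampling:

import torch

def masked_sample(own_probs: torch.Tensor, mask_probs: torch.Tensor) -> int:
    # Hypothetical masking, NOT the repository's implementation: weight the
    # agent's own action distribution by the detached probabilities from the
    # other group, renormalise, and sample one action.
    combined = own_probs * mask_probs
    combined = combined / combined.sum()
    return torch.distributions.Categorical(probs=combined).sample().item()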
Example #4
def main():
    # agent = PGagent(agentParam)
    writer = SummaryWriter('runs/' + envfolder + model_name)
    ######  law only:
    multiPGCen = CenAgents([
        Centralised_AC(action_dim,
                       state_dim,
                       add_para(i),
                       useLaw=False,
                       useCenCritc=useCenCritc,
                       num_agent=n_agents) for i in range(n_agents)
    ], state_dim, agentParam)  # create PGagents as well as a social agent
    # multiPG = Agents([IAC(action_dim,state_dim,add_para(i),useLaw=False,useCenCritc=useCenCritc,num_agent=n_agents) for i in range(n_agents)])  # create PGagents as well as a social agent
    for i_episode in range(n_episode):
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(n_steps):
            actions = multiPGCen.choose_actions(n_state)
            #actions = multiPG.choose_actions(n_state)
            n_state_, n_reward, _, _ = env.step(
                actions)  # interact with the env
            if args.render and i_episode % 10 == 0 and i_episode > 0:  # render or not
                env.render()
                time.sleep(0.1)
            ep_reward += sum(n_reward)  # record the total reward
            '''
            if CentQ:
                multiPG.update_cent(n_state, n_reward, n_state_, actions)
            else:
                multiPG.update(n_state, n_reward, n_state_, actions)
            '''
            ######  law only:
            multiPGCen.update_share(n_state, n_reward, n_state_, actions)
            n_state = n_state_

        running_reward = ep_reward

        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                  format(i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 11 and ifsave_model:
            ######  law only:
            multiPGCen.save(file_name)
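
In this 'law only' variant all agents are trained through update_share on the summed reward. The repository's update is not reproduced on this page; the sketch below shows what a generic one-step actor-critic update on a shared reward could look like, using hypothetical actor/critic modules and optimisers rather than the Centralised_AC internals.

import torch
import torch.nn.functional as F

def shared_ac_update(actor, critic, opt_actor, opt_critic,
                     state, action, shared_reward, next_state, gamma=0.99):
    # actor(state) -> 1-D tensor of action probabilities, critic(state) -> scalar value;
    # both are assumed torch.nn.Module instances, not the repository's classes.
    value = critic(state)
    with torch.no_grad():
        target = shared_reward + gamma * critic(next_state)
    advantage = (target - value).detach()

    critic_loss = F.mse_loss(value, target)
    opt_critic.zero_grad()
    critic_loss.backward()
    opt_critic.step()

    actor_loss = -torch.log(actor(state)[action]) * advantage
    opt_actor.zero_grad()
    actor_loss.backward()
    opt_actor.step()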
Example #5
def main():
    # observation size is height*4 + 1, with 3 discrete actions
    # agent = PGagent(agentParam)
    state_dim = height * 4 + 1
    writer = SummaryWriter('runs/iac_' + model_name)
    # multiPG = independentAgent([PGagent(agentParam) for i in range(n_agents)])
    #multiPG = Agents([IAC(3,height*4+1,add_para(i)) for i in range(n_agents)])  # create PGagents as well as a social agent
    # multiPG = Social_Agents([social_IAC(8,400,agentParam) for i in range(n_agents)],agentParam)
    multiPG = CenAgents(
        [Centralised_AC(3, state_dim, add_para(i)) for i in range(n_agents)],
        state_dim, agentParam)  # create PGagents as well as a social agent
    for i_episode in range(100, n_episode):
        #print(" =====================  ")
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(n_steps):
            #print(" =====================  ")

            actions = multiPG.choose_action(
                n_state)  # select an action for every agent
            # actions = multiPG.select_masked_actions(n_state)
            n_state_, n_reward, _, _ = env.step(
                actions)  # interact with the env
            #if args.render and i_episode%20==0:  # render or not
            #    env.render()
            # multiPG.push_reward(n_reward)  # each agent receive their own reward, the law receive the summed reward
            ep_reward += sum(n_reward)  # record the total reward
            multiPG.update(n_state, n_reward, n_state_, actions)
            # multiPG.update_law()
            n_state = n_state_

        running_reward = ep_reward
        # loss = multiPG.update_agents()  # update the policy for each PGagent
        # multiPG.update_law()  # update the policy of law
        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.
                  format(i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 15 and ifsave_model:
            multiPG.save(file_name)
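
Every loop on this page sets running_reward = ep_reward, so the printed 'Average reward' always equals the last episode's reward. If a genuinely smoothed value is wanted, one option (a suggestion, not the repository's behaviour) is an exponential moving average updated once per episode:

# Illustrative replacement for `running_reward = ep_reward`
# (initialise running_reward = 0.0 before the episode loop):
running_reward = 0.05 * ep_reward + 0.95 * running_reward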
Example #6
def AC_main():
    #agent = PGagent(agentParam)
    all_rw = []
    #n_agents = 1#2
    if mode == "social":
        multiPG = socialAgents([
            PGagent(env_dim["cleanup"][0], env_dim["cleanup"][1], add_para(i))
            for i in range(n_agents)
        ], agentParam)
    elif mode == "AC":
        multiPG = Agents([Centralised_AC(4, 100) for i in range(n_agents)], 50)
    else:
        multiPG = independentAgent([
            PGagent(env_dim["cleanup"][0], env_dim["cleanup"][1], add_para(i))
            for i in range(n_agents)
        ])

    for i_episode in range(1000):
        n_state, ep_reward = env.reset(), 0
        n_state = n_state[0]
        test_reward_sum = 0
        for t in range(1000):

            if mode == "social":
                actions = multiPG.select_mask_actions(n_state)
            else:
                actions = multiPG.choose_action(
                    process_state(n_state))  # select an action for every agent
            #actions = [ random.randint(0,7) for i in range(n_agents)]
            a = deepcopy(actions)
            for i in range(len(actions)):
                a[i] = [actions[i][0]]  # reshape each action into a one-element list before env.step
            n_state_, n_reward, _, _, test_reward = env.step(a)
            test_reward_sum += test_reward
            if render and i_episode != 1:
                # env.render(impath,t)
                env.render()
            # time.sleep(0.05)
            #multiPG.push_reward(n_reward)
            ep_reward += sum(n_reward)
            # if [1] in process_state(n_reward):
            #     print("i_episode %d:"%i_episode,process_state(n_reward))
            multiPG.update(process_state(n_state), process_state(n_reward),
                           process_state(n_state_), actions)
            n_state = n_state_
        running_reward = ep_reward
        #if test_mode == False:
        #    multiPG.update_agents()

        all_rw.append(ep_reward)
        if i_episode % (args.log_interval * 2) == 0 and ifsave_data:
            np.save("data/" + model_name + ".npy", all_rw)
        if i_episode % args.log_interval == 0:
            print(
                'Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\ttest_reward: {:.2f}'
                .format(i_episode, ep_reward[0], running_reward[0],
                        test_reward_sum))
            logger.scalar_summary("ep_reward", ep_reward, i_episode)
            logger.scalar_summary("coin_eaten", test_reward_sum, i_episode)

        if i_episode % save_eps == 0 and i_episode > 10 and ifsave_model:
            multiPG.save(file_name)
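
AC_main stores the per-episode rewards in data/<model_name>.npy via np.save, so the learning curve can be inspected offline. A small usage example, assuming the same model_name as in the save call above:

import numpy as np
import matplotlib.pyplot as plt

rewards = np.load("data/" + model_name + ".npy")   # path produced by the np.save call above
plt.plot(rewards)
plt.xlabel("episode")
plt.ylabel("episode reward")
plt.show()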