def main():
    # agent = PGagent(agentParam)
    # writers = [SummaryWriter('runs/fashion_mnist_experiment_1')]
    n_agents = 2
    # multiPG = independentAgent([PGagent(agentParam) for i in range(n_agents)])
    # create the PG agents as well as a social agent
    multiPG = Agents([Centralised_AC(8, 400) for i in range(n_agents)], 400)
    # multiPG = Social_Agents([social_IAC(8, 400, agentParam) for i in range(n_agents)], agentParam)

    for i_episode in range(101):
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(1, 500):
            actions = multiPG.choose_action(n_state)  # select an action for every agent
            # actions = multiPG.select_masked_actions(n_state)
            n_state_, n_reward, _, _ = env.step(actions)  # interact with the env
            if args.render:  # render or not
                env.render()
            # multiPG.push_reward(n_reward)  # each agent receives its own reward; the law receives the summed reward
            ep_reward += sum(n_reward)  # record the total reward
            multiPG.update(n_state, n_reward, n_state_, actions)
            # multiPG.update_law()
            n_state = n_state_
        running_reward = ep_reward  # note: this is the last episode reward, not a running average
        # loss = multiPG.update_agents()  # update the policy of each PGagent
        # multiPG.update_law()            # update the policy of the law
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            logger.scalar_summary("ep_reward", ep_reward, i_episode)
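# The training loops in this file print "Average reward" but assign
# running_reward = ep_reward, so both printed values are always identical.
# If a smoothed metric is wanted, one common option (a hedged sketch, not part
# of this repo; the helper name and the smoothing factor are hypothetical) is
# an exponential moving average of the episode returns, initialised once before
# the episode loop (e.g. running_reward = 0.0) and updated each episode.
def update_running_reward(running_reward, ep_reward, smoothing=0.95):
    """Exponential moving average of episode returns (illustrative only)."""
    return (1.0 - smoothing) * ep_reward + smoothing * running_reward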
def main():
    writer = SummaryWriter('runs/' + envfolder + model_name)
    # create the centralised (law) agents; useCenCritc: use a central critic for the normal agents
    multiPGCen = CenAgents([
        Centralised_AC(action_dim, state_dim, add_para(i),
                       useLaw=True, useCenCritc=False, num_agent=n_agents)
        for i in range(n_agents)
    ], state_dim, agentParam)
    # create the independent PG agents
    multiPG = Agents([
        IAC(action_dim, state_dim, add_para(i),
            useLaw=True, useCenCritc=True, num_agent=n_agents)
        for i in range(n_agents)
    ])

    for i_episode in range(n_episode):
        n_state, ep_reward = env.reset(), 0  # reset the env
        # print(" lr .... ", multiPG.agents[0].optimizerA.param_groups[0]['lr'])
        for t in range(n_steps):
            if int(i_episode / line) % 2 == 0:
                # law phase: the independent agents only supply detached probabilities (pis),
                # and the centralised agents choose the masked actions
                pis = multiPG.choose_indi_probs(n_state)
                actions = multiPGCen.choose_masked_actions(n_state, pis)
            else:
                # agent phase: the centralised agents supply the mask probabilities,
                # and the independent agents choose the masked actions
                mask_probs = multiPGCen.choose_indi_probs(n_state)
                actions = multiPG.choose_masked_actions(n_state, mask_probs)
            n_state_, n_reward, _, _ = env.step(actions)  # interact with the env
            if args.render and i_episode % 30 == 0 and i_episode > 0:  # render or not
                env.render()
            ep_reward += sum(n_reward)  # record the total reward
            if int(i_episode / line) % 2 == 0:
                multiPGCen.update_share(n_state, n_reward, n_state_, actions)
            else:
                # update_cent: update the normal agents with the central critic
                multiPG.update_cent(n_state, n_reward, n_state_, actions)
                # multiPG.update(n_state, n_reward, n_state_, actions)
            n_state = n_state_
        running_reward = ep_reward
        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 11 and ifsave_model:
            multiPG.save(file_name)
            multiPGCen.save(file_name)
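# The schedule in the loop above alternates which side is trained:
# int(i_episode / line) % 2 splits training into consecutive blocks of `line`
# episodes, with even blocks running the centralised (law) agents and odd
# blocks running the independent agents. A minimal, self-contained sketch of
# that bookkeeping (the helper name `law_phase` is hypothetical, not from this
# repo):
def law_phase(i_episode, block_len):
    """Return True during even blocks of `block_len` episodes (the law-training phase)."""
    return (i_episode // block_len) % 2 == 0

# e.g. with block_len=50: episodes 0-49 -> law phase, 50-99 -> agent phase,
# 100-149 -> law phase again, and so on.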
def main():
    # agent = PGagent(agentParam)
    writer = SummaryWriter('runs/iac_' + model_name)
    n_agents = 2
    state_dim = 400
    action_dim = 8
    # multiPG = independentAgent([PGagent(agentParam) for i in range(n_agents)])
    # create the centralised (law) agents
    multiPGCen = CenAgents([
        Centralised_AC(action_dim, state_dim, add_para(i), useLaw=True)
        for i in range(n_agents)
    ], state_dim, agentParam)
    # multiPG = Law_agent(action_dim, state_dim, agentParam, n_agents)
    # create the independent PG agents
    multiPG = Agents([
        IAC(action_dim, state_dim, add_para(i), useLaw=True)
        for i in range(n_agents)
    ])
    # multiPG = Social_Agents([social_IAC(8, 400, agentParam) for i in range(n_agents)], agentParam)

    for i_episode in range(n_episode):
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(n_steps):
            if int(i_episode / line) % 2 == 1:
                # odd blocks: law phase, the centralised agents pick the masked actions
                pis = multiPG.choose_indi_probs(n_state)
                actions = multiPGCen.choose_masked_actions(n_state, pis)
            else:
                # even blocks: agent phase, actions masked by the law's probabilities
                mask_probs = multiPGCen.choose_mask_probs(n_state)
                actions = multiPG.choose_masked_actions(n_state, mask_probs)
            # actions = multiPG.select_masked_actions(n_state)
            n_state_, n_reward, _, _ = env.step(actions)  # interact with the env
            if args.render and i_episode % 50 == 0:  # render or not
                env.render()
            # multiPG.push_reward(n_reward)  # each agent receives its own reward; the law receives the summed reward
            ep_reward += sum(n_reward)  # record the total reward
            if int(i_episode / line) % 2 == 1:
                # multiPGCen.update(n_state, n_reward, n_state_, actions)
                multiPGCen.update_share(n_state, n_reward, n_state_, actions)
            else:
                multiPG.update(n_state, n_reward, n_state_, actions)
            # multiPG.update_law()
            n_state = n_state_
        running_reward = ep_reward
        # loss = multiPG.update_agents()  # update the policy of each PGagent
        # multiPG.update_law()            # update the policy of the law
        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 11 and ifsave_model:
            multiPG.save(file_name)
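# In both loops above, one side's per-agent probabilities are handed to the other
# side's choose_masked_actions(). The repo's exact masking rule lives inside those
# agent classes; the sketch below only illustrates one plausible reading (an
# assumption, not the repo's implementation): multiply the agent's policy
# elementwise by the mask probabilities, renormalise, and sample.
import numpy as np

def sample_masked_action(policy_probs, mask_probs, rng=None):
    """Hypothetical helper: sample an action from a policy reweighted by a mask."""
    rng = rng or np.random.default_rng()
    combined = np.asarray(policy_probs) * np.asarray(mask_probs)
    combined = combined / combined.sum()  # renormalise to a valid distribution
    return rng.choice(len(combined), p=combined)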
def main():
    # agent = PGagent(agentParam)
    writer = SummaryWriter('runs/' + envfolder + model_name)
    # law only: create the centralised (law) agents
    multiPGCen = CenAgents([
        Centralised_AC(action_dim, state_dim, add_para(i),
                       useLaw=False, useCenCritc=useCenCritc, num_agent=n_agents)
        for i in range(n_agents)
    ], state_dim, agentParam)
    # multiPG = Agents([IAC(action_dim, state_dim, add_para(i), useLaw=False, useCenCritc=useCenCritc, num_agent=n_agents) for i in range(n_agents)])

    for i_episode in range(n_episode):
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(n_steps):
            actions = multiPGCen.choose_actions(n_state)
            # actions = multiPG.choose_actions(n_state)
            n_state_, n_reward, _, _ = env.step(actions)  # interact with the env
            if args.render and i_episode % 10 == 0 and i_episode > 0:  # render or not
                env.render()
                time.sleep(0.1)
            ep_reward += sum(n_reward)  # record the total reward
            '''
            if CentQ:
                multiPG.update_cent(n_state, n_reward, n_state_, actions)
            else:
                multiPG.update(n_state, n_reward, n_state_, actions)
            '''
            # law only: shared update for the centralised agents
            multiPGCen.update_share(n_state, n_reward, n_state_, actions)
            n_state = n_state_
        running_reward = ep_reward
        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 11 and ifsave_model:
            # law only: save the centralised agents
            multiPGCen.save(file_name)
def main():
    # observation shape: [height*4+1, 3]
    # agent = PGagent(agentParam)
    state_dim = height * 4 + 1
    writer = SummaryWriter('runs/iac_' + model_name)
    # multiPG = independentAgent([PGagent(agentParam) for i in range(n_agents)])
    # multiPG = Agents([IAC(3, height*4+1, add_para(i)) for i in range(n_agents)])
    # multiPG = Social_Agents([social_IAC(8, 400, agentParam) for i in range(n_agents)], agentParam)
    # create the centralised agents as well as a social agent
    multiPG = CenAgents(
        [Centralised_AC(3, state_dim, add_para(i)) for i in range(n_agents)],
        state_dim, agentParam)

    for i_episode in range(100, n_episode):
        n_state, ep_reward = env.reset(), 0  # reset the env
        for t in range(n_steps):
            actions = multiPG.choose_action(n_state)  # select an action for every agent
            # actions = multiPG.select_masked_actions(n_state)
            n_state_, n_reward, _, _ = env.step(actions)  # interact with the env
            # if args.render and i_episode % 20 == 0:  # render or not
            #     env.render()
            # multiPG.push_reward(n_reward)  # each agent receives its own reward; the law receives the summed reward
            ep_reward += sum(n_reward)  # record the total reward
            multiPG.update(n_state, n_reward, n_state_, actions)
            # multiPG.update_law()
            n_state = n_state_
        running_reward = ep_reward
        # loss = multiPG.update_agents()  # update the policy of each PGagent
        # multiPG.update_law()            # update the policy of the law
        writer.add_scalar("ep_reward", ep_reward, i_episode)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))
            # logger.scalar_summary("ep_reward", ep_reward, i_episode)
        if i_episode % save_eps == 0 and i_episode > 15 and ifsave_model:
            multiPG.save(file_name)
def AC_main():
    # agent = PGagent(agentParam)
    all_rw = []
    # n_agents = 1  # 2
    if mode == "social":
        multiPG = socialAgents([
            PGagent(env_dim["cleanup"][0], env_dim["cleanup"][1], add_para(i))
            for i in range(n_agents)
        ], agentParam)
    elif mode == "AC":
        multiPG = Agents([Centralised_AC(4, 100) for i in range(n_agents)], 50)
    else:
        multiPG = independentAgent([
            PGagent(env_dim["cleanup"][0], env_dim["cleanup"][1], add_para(i))
            for i in range(n_agents)
        ])

    for i_episode in range(1000):
        n_state, ep_reward = env.reset(), 0
        n_state = n_state[0]
        test_reward_sum = 0
        for t in range(1000):
            if mode == "social":
                actions = multiPG.select_mask_actions(n_state)
            else:
                actions = multiPG.choose_action(process_state(n_state))  # agent.select_action(state)
            # actions = [random.randint(0, 7) for i in range(n_agents)]
            # keep only the first component of each agent's action, wrapped as a one-element list, for env.step
            a = deepcopy(actions)
            for i in range(len(actions)):
                a[i] = [actions[i][0]]
            n_state_, n_reward, _, _, test_reward = env.step(a)
            test_reward_sum += test_reward
            if render and i_episode != 1:
                # env.render(impath, t)
                env.render()
                # time.sleep(0.05)
            # multiPG.push_reward(n_reward)
            ep_reward += sum(n_reward)
            # if [1] in process_state(n_reward):
            #     print("i_episode %d:" % i_episode, process_state(n_reward))
            multiPG.update(process_state(n_state), process_state(n_reward),
                           process_state(n_state_), actions)
            n_state = n_state_
        running_reward = ep_reward
        # if test_mode == False:
        #     multiPG.update_agents()
        all_rw.append(ep_reward)
        if i_episode % (args.log_interval * 2) == 0 and ifsave_data:
            np.save("data/" + model_name + ".npy", all_rw)
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}\ttest_reward: {:.2f}'
                  .format(i_episode, ep_reward[0], running_reward[0], test_reward_sum))
            logger.scalar_summary("ep_reward", ep_reward, i_episode)
            logger.scalar_summary("coin_eaten", test_reward_sum, i_episode)
        if i_episode % save_eps == 0 and i_episode > 10 and ifsave_model:
            multiPG.save(file_name)