import time
from time import gmtime, strftime

# EnvWrapper and MADDPG are provided by the surrounding project; SummaryWriter
# is assumed to come from TensorBoard (tensorboardX would also work).
from torch.utils.tensorboard import SummaryWriter


def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)

    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' + arglist.scenario)

    maddpg_wrapper = MADDPG(ACTORS)
    maddpg_wrapper.create_agents(env, arglist)

    j = 0  # global step counter across all episodes
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for i in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            # accumulate per-worker rewards
            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            # terminal = all(done)

            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2, done)

            obs = obs2
            step += 1

        # periodic checkpoint
        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        # per-episode console and TensorBoard logging
        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ', ep_ave_max / float(step),
                      ' Reward: ', total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(str(worker.pos) + '/Average_max_q',
                                  ep_ave_max / float(step), episode)
                writer.add_scalar(str(worker.pos) + '/Reward Agent',
                                  total_reward[worker.pos], episode)

    env.close()
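# --- Usage sketch (assumed, not from the original repository) ---------------
# main() reads scenario, saved_episode, eval and max_episode from arglist, so
# a minimal argparse driver could look like this; the flag names and defaults
# are illustrative assumptions.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description='MADDPG runner')
    parser.add_argument('--scenario', type=str, default='simple_spread')
    parser.add_argument('--max-episode', dest='max_episode', type=int, default=10000)
    parser.add_argument('--saved-episode', dest='saved_episode', type=int, default=100)
    parser.add_argument('--eval', action='store_true',
                        help='enables the training/logging branches in main()')
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())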
import numpy as np
from collections import deque

# MADDPG is the project's multi-agent wrapper; `env` and `brain_name` are
# expected at module level (see the environment setup sketch below).


def trainFunction(state_size, action_size, n_episodes=4000, num_agents=2):
    magent = MADDPG(action_size=action_size, noise_start=1.0, seed=2,
                    gamma=0.99, t_stop_noise=30000)
    scores = []
    scores_deque = deque(maxlen=100)
    scores_avg = []

    for i_episode in range(1, n_episodes + 1):
        rewards = []
        env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
        states = env_info.vector_observations  # get the current state (for each agent)
        # request a network update only at the start of odd-numbered episodes
        update = bool(i_episode % 2)

        # loop over steps
        while True:
            # select an action
            joint_actions = magent.act(states, update)
            update = False
            # take action in environment and set parameters to new values
            env_info = env.step(joint_actions)[brain_name]
            next_states = env_info.vector_observations
            rewards_v = env_info.rewards
            done_v = env_info.local_done
            # update and train agent with returned information
            magent.step(states, joint_actions, rewards_v, next_states, done_v)
            states = next_states
            rewards.append(rewards_v)
            if any(done_v):
                break

        # calculate episode reward as maximum of individually collected rewards of agents
        episode_reward = np.max(np.sum(np.array(rewards), axis=0))
        scores.append(episode_reward)        # save most recent score to overall score array
        scores_deque.append(episode_reward)  # save most recent score to running window of 100 last scores
        current_avg_score = np.mean(scores_deque)
        scores_avg.append(current_avg_score)  # save average of last 100 scores to average score array

        print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, current_avg_score), end="")

        # log average score every 200 episodes
        if i_episode % 200 == 0:
            print('\rEpisode {}\tAverage Score: {:.3f}'.format(i_episode, current_avg_score))

        # break and report success if environment is solved
        if np.mean(scores_deque) >= .5 and i_episode % 200 == 0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(
                i_episode, np.mean(scores_deque)))
            magent.save()
            break
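# --- Environment setup sketch (assumed) -------------------------------------
# trainFunction() relies on module-level `env` and `brain_name`. The fields it
# reads (vector_observations, rewards, local_done) match the Udacity
# `unityagents` wrapper for the two-agent Tennis environment, so a setup along
# these lines is assumed; the binary path is illustrative.
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='Tennis.app')
brain_name = env.brain_names[0]            # the default brain drives both agents
brain = env.brains[brain_name]

env_info = env.reset(train_mode=True)[brain_name]
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

trainFunction(state_size, action_size)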
import time
import numpy as np

# avs (the environment), the MADDPG/DDPG classes, imitation(), plot(), and the
# upper-case constants (ALGORITHM, RETRAIN, VAR, MAX_EPISODES, MAX_EP_STEPS,
# IMITATION_EPISODE, MEMORY_CAPACITY, MIN_VAR, DECAY, RENDER, DEBUG) are
# defined elsewhere in the script.


def update():
    if ALGORITHM == 'maddpg':
        ddpg = MADDPG(avs.n_actions, avs.n_features, 1, 'maddpg model', RETRAIN)
    elif ALGORITHM == 'ddpg':
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)
    else:  # fall back to DDPG for unrecognized algorithm names
        ddpg = DDPG(avs.n_actions, avs.n_features, 1, 'ddpg model', RETRAIN)

    t1 = time.time()
    rewards1 = 0
    rewards2 = 0
    var = VAR  # exploration noise scale
    collision = 0
    avgreward1 = []
    avgreward2 = []
    collision_percentage = []

    for i in range(MAX_EPISODES):
        s1, s2 = avs.reset()
        ep_reward1 = 0
        ep_reward2 = 0
        if i % 100000 == 0 and i > IMITATION_EPISODE:
            plot(avgreward1, avgreward2, collision_percentage, i)

        for j in range(MAX_EP_STEPS):
            if RENDER:
                avs.render()

            # Add exploration noise
            if i < IMITATION_EPISODE or i % 4 == 0:
                # bootstrap (and periodically refresh) with the imitation policy
                a1 = imitation(avs.agent1, avs.agent2, avs.target1)
                a2 = imitation(avs.agent2, avs.agent1, avs.target2)
            else:
                # add randomness to action selection for exploration
                a1 = ddpg.choose_action(s1)
                a1 = [np.clip(np.random.normal(a1[0], var), -1, 1),
                      np.clip(np.random.normal(a1[1], var), -1, 1)]
                a2 = ddpg.choose_action(s2)
                a2 = [np.clip(np.random.normal(a2[0], var), -1, 1),
                      np.clip(np.random.normal(a2[1], var), -1, 1)]
                # a2 = imitation(avs.agent2, avs.agent1, avs.target2)

            if DEBUG:
                time.sleep(0.1)

            s_1, r1, s_2, r2, done, info = avs.step(a1, a2)

            # DDPG stores per-agent transitions; MADDPG also stores the other
            # agent's state and action
            if ALGORITHM == 'ddpg':
                ddpg.store_transition(s1, a1, r1, s_1)
                ddpg.store_transition(s2, a2, r2, s_2)
            else:
                ddpg.store_transition(s1, s2, a1, a2, r1, s_1, s_2)
                ddpg.store_transition(s2, s1, a2, a1, r2, s_2, s_1)

            s1 = s_1
            s2 = s_2
            ep_reward1 += r1
            ep_reward2 += r2

            if j == MAX_EP_STEPS - 1 or done:
                print("pt:", ddpg.pointer)
                print('Episode:', i, 'Step:', j, ' Reward: %i' % int(ep_reward1),
                      int(ep_reward2), 'Explore: %.2f' % var)
                if i >= IMITATION_EPISODE:
                    rewards1 += ep_reward1
                    rewards2 += ep_reward2
                    if r1 < -100:
                        collision += 1
                    if (i + 1) % 100 == 0:
                        # per-100-episode statistics, so the collision count
                        # doubles as a percentage
                        avgreward1.append(rewards1 / 100)
                        avgreward2.append(rewards2 / 100)
                        collision_percentage.append(collision)
                        rewards1 = 0
                        rewards2 = 0
                        collision = 0
                break

            # start learning once the replay buffer has filled
            if ddpg.pointer > MEMORY_CAPACITY:
                ddpg.learn()
                ddpg.learn()
                if var > MIN_VAR and i > IMITATION_EPISODE:
                    var *= DECAY  # decay the action randomness

        if i % 4 != 0 and ep_reward1 > 100 and ep_reward2 > 100 and i > IMITATION_EPISODE:
            ddpg.save(i)

    print('Running time: ', time.time() - t1)
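# --- plot() sketch (hypothetical) --------------------------------------------
# The original plot() helper is not shown; this matplotlib sketch assumes only
# the call signature used above: two average-reward series plus a
# per-100-episode collision count, written to disk at episode `episode`.
import matplotlib.pyplot as plt


def plot(avgreward1, avgreward2, collision_percentage, episode):
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 6))
    ax1.plot(avgreward1, label='agent 1')
    ax1.plot(avgreward2, label='agent 2')
    ax1.set_ylabel('avg reward / 100 episodes')
    ax1.legend()
    ax2.plot(collision_percentage)
    ax2.set_xlabel('training progress (x100 episodes)')
    ax2.set_ylabel('collisions / 100 episodes')
    fig.tight_layout()
    fig.savefig('training_{}.png'.format(episode))
    plt.close(fig)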
import os
from tqdm import trange

# MADDPG, env, args (a list of per-model keyword dicts), control_args, and
# root come from the surrounding script. The list literal that builds
# model_names is truncated in this excerpt; the reconstruction below
# (stringified argument dicts) is an assumption based on the clean-up code
# that follows it.
model_names = [str(list(arg.values())) for arg in args]

# handle invalid dir chars
for i in range(len(model_names)):
    model_names[i] = (model_names[i].replace('[', '').replace(']', '')
                      .replace(' ', '').replace(',', '_'))

# handle standard arg, i.e., {}
model_names = ['standard' if name == '' else name for name in model_names]

# model loop
for i in trange(len(args), desc='model', leave=True):
    model_dir = '{}/{}'.format(root, model_names[i])
    os.mkdir(model_dir)

    # log cmd
    with open('{}/cmd_config.txt'.format(model_dir), 'w') as f:
        for k, v in control_args.items():
            f.write(str(k) + ': ' + str(v) + '\n')

    arg = args[i]

    # repeat loop
    for n in trange(control_args['repeat'], desc='repeat', leave=True):
        run_dir = '{}/{}'.format(model_dir, n)
        os.mkdir(run_dir)
        maddpg = MADDPG(env, **arg)
        if 'load' in control_args:
            model_path = control_args['load']
            maddpg.load_actor(model_path)
            maddpg.load_critic(model_path)
        if control_args['train']:
            maddpg.train(run_dir, control_args['save_interval'])
            maddpg.save(run_dir)
        maddpg.test(run_dir, n=control_args['n_test'])
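# --- Configuration sketch (assumed) ------------------------------------------
# The sweep above expects `root`, `args` (one dict of MADDPG keyword overrides
# per model) and `control_args`. The control keys below appear in the original
# snippet; the grid parameters and their values are illustrative assumptions.
import itertools
import os

root = 'results'
os.makedirs(root, exist_ok=True)

grid = {'lr': [1e-3, 1e-4], 'gamma': [0.95, 0.99]}
args = [{}] + [dict(zip(grid, combo))
               for combo in itertools.product(*grid.values())]

control_args = {'repeat': 3, 'train': True, 'save_interval': 1000, 'n_test': 10}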