def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_theta_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_mu_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_sigma_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_dt_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_x0_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0
        coords = []
        fully_break = False

        while steps < 300:
            steps += 1

            # render
            if is_render:
                env.render()
                time.sleep(0.05)

            if args.dump_file:
                # This is not ours... record the raw frame coordinates
                # so the episode can be replayed later
                frame = env.dump_file()
                coords.append(frame)
                print("Working...\n")

            # act
            actions = []
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](),
                    -2, 2)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)

            # learn
            if not is_testing:
                size = memories[0].pointer
                # the same indices work for every agent's memory, since the
                # per-agent memories fill in lockstep
                if size < batch_size:
                    batch = random.sample(range(size), size)
                else:
                    batch = random.sample(range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 500
                    # store the joint observations and actions so the
                    # centralized critics can see all agents
                    memories[i].remember(states, actions, rewards[i],
                                         states_next, done[i])
                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch, env.n)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(actors, s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistic.extend([actors_noise[i].theta for i in range(env.n)])
                statistic.extend([actors_noise[i].mu for i in range(env.n)])
                statistic.extend([actors_noise[i].sigma for i in range(env.n)])
                statistic.extend([actors_noise[i].dt for i in range(env.n)])
                statistic.extend([actors_noise[i].x0 for i in range(env.n)])
                statistics.add_statistics(statistic)

                # This is not ours... dump the recorded frames
                if args.dump_file:
                    print("Creating...\n")
                    with open("results/coords.txt", "w+") as f:
                        # first line: number of agents (each frame holds
                        # an (x, y) pair per agent)
                        f.write(str(len(coords[0]) // 2))
                        f.write("\n")
                        for fr in coords:
                            print("Writing...\n")
                            f.write(" ".join(str(i) for i in fr))
                            f.write("\n")
                    fully_break = True
                    break

                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

        if fully_break:
            break

    return statistics
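# The dump block above writes a simple text format: the first line holds the
# agent count (half the coordinate count of a frame, assuming (x, y) pairs),
# followed by one space-separated line per frame. A minimal reader sketch for
# that format under those assumptions; load_coords is not part of the
# original sources:
def load_coords(path="results/coords.txt"):
    with open(path) as f:
        n_agents = int(f.readline())
        frames = [[float(v) for v in line.split()] for line in f]
    return n_agents, frames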
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_theta_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_mu_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_sigma_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_dt_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["ou_x0_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()

            # act
            actions = []
            for i in range(env.n):
                action = np.clip(
                    actors[i].choose_action(states[i]) + actors_noise[i](),
                    -2, 2)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)

            # learn
            if not is_testing:
                size = memories[0].pointer
                if size < batch_size:
                    batch = random.sample(range(size), size)
                else:
                    batch = random.sample(range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50
                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])
                    if memories[i].pointer > batch_size * 10:
                        s, a, r, sn, _ = memories[i].sample(batch)
                        r = np.reshape(r, (batch_size, 1))
                        loss = critics[i].learn(s, a, r, sn)
                        actors[i].learn(s)
                        episode_losses[i] += loss
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistic.extend([actors_noise[i].theta for i in range(env.n)])
                statistic.extend([actors_noise[i].mu for i in range(env.n)])
                statistic.extend([actors_noise[i].sigma for i in range(env.n)])
                statistic.extend([actors_noise[i].dt for i in range(env.n)])
                statistic.extend([actors_noise[i].x0 for i in range(env.n)])
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            if not os.path.exists(weights_filename_prefix):
                os.makedirs(weights_filename_prefix)
            save_path = saver.save(session,
                                   os.path.join(weights_filename_prefix,
                                                "models"),
                                   global_step=episode)
            print("saving model to {}".format(save_path))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
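# The actors_noise[i]() calls above add exploration noise to the deterministic
# policies, and the logged attributes (theta, mu, sigma, dt, x0) match an
# Ornstein-Uhlenbeck process. A hedged sketch of such a noise class, assuming
# the standard Euler-Maruyama discretization; the actual class used by these
# scripts is not shown in this excerpt and may differ in detail:
class OrnsteinUhlenbeckNoise:
    def __init__(self, mu, theta=0.15, sigma=0.2, dt=1e-2, x0=None):
        self.theta, self.mu, self.sigma, self.dt = theta, mu, sigma, dt
        self.x0 = x0
        self.x = x0 if x0 is not None else np.zeros_like(mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        self.x = self.x + self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) \
            * np.random.normal(size=np.shape(self.mu))
        return self.x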
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()
                time.sleep(0.1)

            # act
            actions = []
            actions_onehot = []
            for i in range(env.n):
                action = dqns[i].choose_action(states[i])
                # adversaries move slightly slower than the prey
                speed = 0.9 if env.agents[i].adversary else 1
                onehot_action = np.zeros(n_actions[i])
                onehot_action[action] = speed
                actions_onehot.append(onehot_action)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions_onehot)

            # learn
            if not is_testing:
                size = memories[0].pointer
                if size < batch_size:
                    batch = random.sample(range(size), size)
                else:
                    batch = random.sample(range(size), batch_size)
                for i in range(env.n):
                    if done[i]:
                        rewards[i] -= 50
                    memories[i].remember(states[i], actions[i], rewards[i],
                                         states_next[i], done[i])
                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
                    else:
                        episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                simple_tag_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqns[i].eps_greedy for i in range(env.n)])
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
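# dqns[i].choose_action() above is epsilon-greedy (the eps_greedy attribute is
# logged per agent), and dqns[i].learn() returns a Keras History object, so the
# Q-network is presumably a Keras model. A hedged sketch of what choose_action
# likely does; every name besides eps_greedy is an assumption, and whether
# eps_greedy is the explore or the exploit probability depends on the actual
# class, which is not part of this excerpt:
def choose_action(self, state):
    if np.random.rand() < self.eps_greedy:
        # explore: uniform random action
        return np.random.randint(self.n_actions)
    # exploit: pick the action with the highest predicted Q-value
    q_values = self.model.predict(state[np.newaxis, :])
    return int(np.argmax(q_values[0]))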
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.append("done")
    statistics_header.append("reward")
    statistics_header.extend(
        ["loss_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["eps_greedy_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Agent Energy Left_{}".format(i) for i in range(env.num_agents)])
    statistics_header.extend(
        ["Task Energy Left_{}".format(i) for i in range(env.num_agents)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.num_agents)
        episode_rewards = 0  # a single shared reward in this environment
        steps = 0
        all_states = [states]

        while steps <= 600:
            steps += 1

            # render
            # if is_render:
            #     env._render()

            # act
            actions = []
            for i in range(env.num_agents):
                action = dqns[i].choose_action(states)
                actions.append(action)

            # step
            states_next, rewards, done, info = env.step(actions)
            all_states.append(states_next)

            # learn
            if not is_testing:
                size = memories[0].pointer
                if size < batch_size:
                    batch = random.sample(range(size), size)
                else:
                    batch = random.sample(range(size), batch_size)
                for i in range(env.num_agents):
                    memories[i].remember(states, actions[i], rewards,
                                         states_next, done)
                    if memories[i].pointer > batch_size * 10:
                        history = dqns[i].learn(*memories[i].sample(batch))
                        episode_losses[i] += history.history["loss"][0]
            else:
                for i in range(env.num_agents):
                    episode_losses[i] = -1

            states = states_next
            episode_rewards += rewards

            # reset states if done
            if done or steps >= 600:
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.append(done)
                statistic.append(episode_rewards)
                statistic.extend(
                    [episode_losses[i] for i in range(env.num_agents)])
                statistic.extend(
                    [dqns[i].eps_greedy for i in range(env.num_agents)])
                statistic.extend([env.B_k[i] for i in range(env.num_agents)])
                statistic.extend([env.T_i[i] for i in range(env.num_agents)])
                statistics.add_statistics(statistic)
                print(statistics.summarize_last())  # log every episode
                if done:
                    with open('/save/states/episode{}_states.txt'.format(
                            episode), mode='w') as myfile:
                        for each in all_states:
                            myfile.write(str(each))
                            myfile.write('\n')
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            general_utilities.save_dqn_weights(
                dqns, "{}_{}_".format(weights_filename_prefix, episode))
            if episode >= checkpoint_interval:
                os.remove("{}_{}.csv".format(csv_filename_prefix,
                                             episode - checkpoint_interval))

    return statistics
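# Every variant above samples by first drawing indices with
# random.sample(range(memories[i].pointer), ...) and then calling
# memories[i].sample(batch), so the replay memory exposes a transition count
# (.pointer), an append (.remember), and an index-based .sample. A minimal
# sketch matching that interface, ignoring capacity limits and eviction for
# brevity; the real class is not part of this excerpt:
class ReplayMemory:
    def __init__(self):
        self.data = []    # (state, action, reward, state_next, done) tuples
        self.pointer = 0  # number of stored transitions

    def remember(self, s, a, r, s_next, done):
        self.data.append((s, a, r, s_next, done))
        self.pointer += 1

    def sample(self, indices):
        # stack each field of the selected transitions into its own array
        columns = zip(*(self.data[i] for i in indices))
        return [np.array(c) for c in columns]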
def play(episodes, is_render, is_testing, checkpoint_interval,
         weights_filename_prefix, csv_filename_prefix, batch_size):
    # init statistics. NOTE: simple tag specific!
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["collisions_{}".format(i) for i in range(env.n)])
    print("Collecting statistics {}:".format(" ".join(statistics_header)))
    statistics = general_utilities.Time_Series_Statistics_Store(
        statistics_header)

    for episode in range(episodes):
        states = env.reset()
        episode_losses = np.zeros(env.n)
        episode_rewards = np.zeros(env.n)
        collision_count = np.zeros(env.n)
        steps = 0

        while True:
            steps += 1

            # render
            if is_render:
                env.render()

            # act
            actions = np.zeros(env.n, dtype=int)
            actions_onehot = []
            action = dqn.choose_action(states[0])
            speed = 0.9
            # distribute the joint action to the two players: the single DQN
            # picks one of 5 * 5 = 25 joint actions, encoded in base 5
            actions[0], actions[1] = divmod(action, 5)
            for i in range(env.n):
                onehot_action = np.zeros(n_actions)
                onehot_action[actions[i]] = speed
                actions_onehot.append(onehot_action)

            # step
            states_next, rewards, done, info = env.step(actions_onehot)
            reward_cal = rewards[0] + rewards[1]

            # learn
            if not is_testing:
                size = memories.pointer
                if size < batch_size:
                    batch = random.sample(range(size), size)
                else:
                    batch = random.sample(range(size), batch_size)
                done_cal = np.logical_and(done[0], done[1])
                memories.remember(states[0], action, reward_cal,
                                  states_next[0], done_cal)
                if memories.pointer > batch_size * 10:
                    history = dqn.learn(*memories.sample(batch))
                    episode_losses[0] += history.history["loss"][0]
                else:
                    episode_losses[0] = -1

            states = states_next
            episode_rewards += rewards
            collision_count += np.array(
                new_alg_utilities.count_agent_collisions(env))

            # reset states if done
            if any(done):
                episode_rewards = episode_rewards / steps
                episode_losses = episode_losses / steps

                statistic = [episode]
                statistic.append(steps)
                statistic.extend([episode_rewards[i] for i in range(env.n)])
                statistic.extend([episode_losses[i] for i in range(env.n)])
                statistic.extend([dqn.eps_greedy] * env.n)
                statistic.extend(collision_count.tolist())
                statistics.add_statistics(statistic)
                if episode % 25 == 0:
                    print(statistics.summarize_last())
                break

        if episode % checkpoint_interval == 0:
            statistics.dump("{}_{}.csv".format(csv_filename_prefix, episode))
            # general_utilities.save_dqn_weights(
            #     dqn, "{}_{}_".format(weights_filename_prefix, episode))
            # if episode >= checkpoint_interval:
            #     os.remove("{}_{}.csv".format(csv_filename_prefix,
            #                                  episode - checkpoint_interval))

    return statistics
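# A hedged sketch of how these scripts appear to wire play() up from the
# command line. The args.* accesses above suggest an argparse namespace;
# args.experiment_prefix and args.weights_filename_prefix occur in the script
# fragment below, while args.checkpoint_interval, args.csv_filename_prefix,
# and args.batch_size are assumptions:
if __name__ == "__main__":
    statistics = play(args.episodes, args.render, args.testing,
                      args.checkpoint_interval,
                      args.experiment_prefix + args.weights_filename_prefix,
                      args.experiment_prefix + args.csv_filename_prefix,
                      args.batch_size)
    statistics.dump("{}.csv".format(
        args.experiment_prefix + args.csv_filename_prefix))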
dqns = [
    DQN(n_actions[i], state_sizes[i], eps_greedy=epsilon_greedy[i])
    for i in range(env.n)
]
general_utilities.load_dqn_weights_if_exist(
    dqns, args.experiment_prefix + args.weights_filename_prefix)

start_time = time.time()

statistics_header = ["episode"]
statistics_header.append("steps")
statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
statistics_header.extend(["loss_{}".format(i) for i in range(env.n)])
statistics_header.extend(["eps_greedy_{}".format(i) for i in range(env.n)])
print("Collecting statistics {}:".format(" ".join(statistics_header)))
statistics = general_utilities.Time_Series_Statistics_Store(
    statistics_header)

for episode in range(args.episodes):
    states = env.reset()
    episode_losses = np.zeros(env.n)
    episode_rewards = np.zeros(env.n)
    steps = 0

    while True:
        steps += 1

        # render
        if args.render:
            env.render()

        # act