def autoplay(method, environment, resume, render):
    game = Game(name=environments_to_names[environment], render=render)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n

    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, 1, 1, eps_start=0.0)
    agent.load(resume)

    log.info(f'Evaluating agent, loaded from {resume}, starting ...')

    game.reset()
    state = game.get_state()
    for t in count():
        state = game.get_state()
        action = agent.select_action(state)
        transition, done = game.step(int(action.cpu().numpy()))
        # agent.eval(
        #     transition, 1, 0.0)
        time.sleep(0.1)
        if done:
            log.info(f'agent survived {t} steps')
            game.reset()
            break
    game.env.close()
def teachAgent(self):
    iteration = 0
    while iteration < self.numOfEpisodes:
        game = Game(self.agent)
        game.start(training=True)
        iteration += 1
        if iteration % 10000 == 0:
            print("Training round: " + str(iteration))
    plot_agent_reward(self.agent.rewards)
def resetGame(self, msg=None):
    wait_time = 3
    self.game.waitScreen(msg1="Put Right Wrist on starting point.",
                         msg2=msg, duration=wait_time)
    self.game = Game()
    self.action_human = 0.0
    self.action_agent = 0.0
    self.game.start_time = time.time()
    self.total_reward_per_game = 0
    self.turns = 0
def play(environment):
    game = Game(name=environments_to_names[environment], render=True)
    done = False
    try:
        while not done:
            action = click.prompt('Please enter an action (0, 1, 2, 3..)')
            done = game.step(int(action))
        log.info('[INFO] done ...')
    except KeyboardInterrupt:
        log.info("[INFO] quitting ...")
        exit()
def userPlayAgent(self):
    while True:
        game = Game(self.agent)
        game.start()
        playAgain = input("Would you like to play again? ('y', 'n'): ")
        if playAgain == 'n':
            print("See you later!")
            break
        print()
        print("Okay, let's play again!")
        print()
class controller:
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human1 = 0.0
        self.action_human2 = 0.0
        self.human_sub = rospy.Subscriber("/rl/action_x", action_human,
                                          self.set_action_human1)
        self.human_sub = rospy.Subscriber("/rl/action_y", action_human,
                                          self.set_action_human2)
        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/reward_observation_robot',
                                             reward_observation,
                                             queue_size=10, latch=True)
        self.transmit_time_list = []

    def set_action_human1(self, act):
        self.action_human1 = act.action
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       act.header.stamp.to_sec())

    def set_action_human2(self, act):
        self.action_human2 = act.action
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       act.header.stamp.to_sec())

    def game_loop(self):
        total_time = []
        # print self.action_human
        for exp in range(self.experiments_num):
            print("Experiment %d" % exp)
            while self.game.running:
                exec_time = self.game.play(
                    [self.action_human1, self.action_human2])
                total_time.append(exec_time)
            # reset game
            self.game = Game()
            self.game.start_time = time.time()
        self.game.endGame()
        print("Average execution time for the play function is %f milliseconds.\n"
              % (mean(total_time) * 1e3))
        print("Average time from publishing to receiving is %f milliseconds.\n"
              % (mean(self.transmit_time_list) * 1e3))
def resetGame(self, msg=None):
    """Resets the Turtle Game."""
    wait_time = 3
    # self.reset_robot_pub.publish(1)
    self.game.waitScreen(msg1="Put Right Wrist on starting point.",
                         msg2=msg, duration=wait_time)
    self.game = Game()
    self.TIME = ACTION_DURATION * INTER_NUM
    self.action_human = 0.0
    self.game.start_time = time.time()
    self.total_reward_per_game = 0
    self.turns = 0
def __init__(self):
    print("init")
    self.experiments_num = 10
    # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
    self.game = Game()
    self.action_human1 = 0.0
    self.action_human2 = 0.0
    self.human_sub = rospy.Subscriber("/rl/action_x", action_human,
                                      self.set_action_human1)
    self.human_sub = rospy.Subscriber("/rl/action_y", action_human,
                                      self.set_action_human2)
    # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
    self.obs_robot_pub = rospy.Publisher('/rl/reward_observation_robot',
                                         reward_observation,
                                         queue_size=10, latch=True)
    self.transmit_time_list = []
def game_loop(self):
    total_time = []
    # print self.action_human
    for exp in range(self.experiments_num):
        print("Experiment %d" % (exp + 1))
        while self.game.running:
            exec_time = self.game.play(
                [self.action_human, self.action_agent])
            total_time.append(exec_time)
            self.publish_reward_and_observations()
            # self.game.endGame()
        # publish last reward and state
        self.publish_reward_and_observations()
        # reset game
        self.game = Game()
        self.game.start_time = time.time()
    self.game.endGame()
def run_simulator(
        n_games=1000,
        p1=Player(strategy="basic_q", learning=True),
        p2=Player(strategy="random", learning=False),
        save_Q=False,
):
    """Runs a number of tic tac toe games

    Arguments:
        n_games: number of tic tac toe games to be played
        p1: instance of the Player() class, should be used to specify the AI
            during training or testing
        p2: instance of the Player() class, should be used to specify the opponent
        save_Q: boolean, determines if p1's updated Q should be saved in a
            pickled file after training is complete

    Returns:
        metrics: list of P1's outcomes, one per game (a Counter of
            wins/losses/ties is printed at the end)

    Use this function to run many games in a row. Depending on the parameters
    for P1 and P2, this could be used for training or testing. This function
    also specifies the epsilon (exploration factor) decay over time as the
    learner moves from high exploration to low.
    """
    games = [Game(p1=p1, p2=p2) for i in range(n_games)]
    metrics = []
    index = 0
    starting_epsilon = p1._epsilon

    for game in games:
        index += 1
        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()
        # log game results
        metrics.append(outcome)
        # update q learner after each game
        if p1._learning:
            p1.update_q(outcome, x_decisions)
        # reduce exploration factor after each game
        p1._epsilon = starting_epsilon - starting_epsilon * (1. * index / n_games)**2

    print(Counter(metrics))

    # save pickled Q learning file
    if save_Q:
        pickle.dump(p1._Q, open(p1.q_file, "wb"))

    return metrics
def __init__(self):
    print("init")
    self.experiments_num = 10
    # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
    self.game = Game()
    self.action_human = 0.0
    self.action_agent = 0.0
    # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
    # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
    self.agent_sub = rospy.Subscriber("/rl/final_action", action_agent,
                                      self.set_action_agent)
    # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
    self.obs_robot_pub = rospy.Publisher('/rl/reward_and_observation_game',
                                         reward_observation,
                                         queue_size=10, latch=True)
    self.transmit_time_list = []
    self.prev_state = self.game.getState()
    self.publish_reward_and_observations()
def game_loop(self):
    total_time = []
    # print self.action_human
    for exp in range(self.experiments_num):
        print("Experiment %d" % exp)
        while self.game.running:
            exec_time = self.game.play(
                [self.action_human1, self.action_human2])
            total_time.append(exec_time)
        # reset game
        self.game = Game()
        self.game.start_time = time.time()
    self.game.endGame()
    print("Average execution time for the play function is %f milliseconds.\n"
          % (mean(total_time) * 1e3))
    print("Average time from publishing to receiving is %f milliseconds.\n"
          % (mean(self.transmit_time_list) * 1e3))
def __init__(self):
    print("init")
    self.experiments_num = 10
    # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
    self.game = Game()
    self.action_human = 0.0
    self.action_agent = 0.0
    # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
    # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
    self.agent_sub = rospy.Subscriber("/rl/total_action", action_agent,
                                      self.set_action_agent)
    # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
    self.obs_robot_pub = rospy.Publisher('/rl/game_response',
                                         reward_observation,
                                         queue_size=10, latch=True)
    self.transmit_time_list = []
    self.rewards_list = []
    self.turn_list = []
    self.interaction_time_list = []
    self.interaction_training_time_list = []
    self.mean_list = []
    self.stdev_list = []
    self.alpha_values = []
    self.policy_loss_list = []
    self.value_loss_list = []
    self.critics_lr_list = []
    self.value_critic_lr_list = []
    self.actor_lr_list = []

    self.plot_directory = package_path + "/src/plots/"
    if not os.path.exists(self.plot_directory):
        print("Dir %s was not found. Creating it..." % (self.plot_directory))
        os.makedirs(self.plot_directory)
def main():
    env = Game(fps=fps, screensize=screen, state=state,
               playersize=player_size, playercolor=player_color,
               enemysize=enemy_size, enemycolor=enemy_color,
               squaresize=square_size, squarecolor=square_color)
    get_ann = {
        "clever": build_ann,
        "forged": forge_ann,
        "keras": keras_ann
    }.get(agent_type, lambda *args: None)
    # the agent name was already taken by the agent module :(
    actor = get_agent(env=env, get_network=get_ann)
    env.reset(actor)
    env.mainloop()
def adversarial_training(
        p1=Player(strategy="basic_q", learning=False, load_Q=True),
        p2=Player(strategy="random", learning=False),
        p3=Player(strategy="basic_q", learning=True, load_Q=True),
        p4=Player(strategy="adversarial", learning=False),
):
    """Finds strategies that beat the AI for targeted training

    Arguments:
        p1: trained Q-learner in "test mode" (no learning)
        p2: cpu player choosing random moves
        p3: trained Q-learner that will continue to train
        p4: p3's opponent, uses saved strategy uncovered by p2 that beat p1

    Plays a trained AI against a random player until (if) the random player
    wins. If the random player does win, it will save that strategy in its
    own Q file. It will then play this adversarial scenario many times so
    that the AI can learn a better strategy. Training is very specific and
    deep (many repetitions), so this is not meant to be a general training
    strategy and is best used after the AI is already sufficiently robust.
    Each time this function runs will only cover one adversarial example, so
    it may need to be run many times.
    """
    # phase 1: AI vs. random opponent
    max_games = 50000
    games = [Game(p1=p1, p2=p2) for i in range(max_games)]

    for game in games:
        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()

        # build a Q network for player O only where X lost
        if outcome == "lost":
            # phase 2: adversarial training
            print("AI lost. Playing adversarial games...")
            Q_adversarial = {board: action for board, action in o_decisions}
            num_games = 10000
            a_games = [Game(p1=p3, p2=p4) for i in range(num_games)]
            starting_epsilon = p3._epsilon
            a_index = 0
            a_metrics = []
            for game in a_games:
                p4._Q = Q_adversarial
                a_index += 1
                outcome, x_decisions, o_decisions = game.play_game()
                a_metrics.append(outcome)
                p3.update_q(outcome, x_decisions)
                p3._epsilon = starting_epsilon - starting_epsilon * (
                    1. * a_index / num_games)**2
            print("Adversarial game outcomes:\n")
            print(Counter(a_metrics))
            print("Building better, stronger Q...")
            pickle.dump(p3._Q, open(p3.q_file, "wb"))
            # end training
            return

    print("played %d games without losing!" % max_games)
def human_vs_human():
    Game(p1=Player(strategy="human"),
         p2=Player(strategy="human"),
         verbose=True).play_game()
def new_game(self):
    return Game(self.action_space_size, self.discount)
# r,g,b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
# return 0.2989 * r + 0.5870 * g + 0.1140 * b

# def shapeState(img):
#     # resize image and make grayscale
#     resized_gray = rgb2gray(scipy.misc.imresize(img, (64, 64))) / 255.0
#     shaped = resized_gray.reshape(1, 64, 64, 1)
#     return shaped


def shapeState(img):
    return img.reshape(1, 64, 64, 1)


# env = gym.make('CartPole-v1')
env = Game()
next_state, reward, done = env.render()

# feed 64 by 64 grayscale images into CNN
state_size = (64, 64)
action_size = len(env.keylist)
agent = Player(state_size, action_size)
done = False
batch_size = 32
max_score = 0
frames = []
best = []

print('get game window in focus')
for i in list(range(4))[::-1]:
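    # Assumed completion of the excerpt above, which is cut off at the
    # countdown loop: a short pause so the user can bring the game window into
    # focus before frames are captured.  These are hypothetical lines (and
    # they presume `time` is imported in the original script).
    print(i)
    time.sleep(1)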
def __init__(self): # print("init") self.game = Game() # initialize a turtle game instance self.agent = SAC() # initialize a soft actor critic agent """ initialize human & agent actions """ self.action_human = 0.0 self.action_agent = 0.0 """ Create subscribers for human action act_human_sub_y is for the case that the agent's action is not used """ self.act_human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_human_action) self.act_human_sub_y = rospy.Subscriber("/rl/action_y", action_msg, self.set_human_action_y) """ Create publishers for turtle's acceleration, agent's action, robot reset signal and turtle's position on x axis """ self.turtle_accel_pub = rospy.Publisher("/rl/turtle_accel", Float32, queue_size=10) self.act_agent_pub = rospy.Publisher("/rl/action_y", action_msg, queue_size=10) self.reset_robot_pub = rospy.Publisher("/robot_reset", Int16, queue_size=1) self.turtle_state_pub = rospy.Publisher("/rl/turtle_pos_X", Int16, queue_size=10) """ Initialize statistics lists """ self.transmit_time_list = [] self.agent_act_list = [] self.human_act_list = [] self.action_timesteps = [] self.exec_time_list = [] self.act_human_list = [] self.real_act_list = [] self.time_real_act_list = [] self.time_turtle_pos = [] self.time_turtle_vel = [] self.time_turtle_acc = [] self.exec_time = None self.action_human_timestamp = 0 self.human_action_delay_list = [] self.used_human_act_list = [] self.used_human_act_time_list = [] self.position_timesteps = [] self.fps_list = [] self.action_human_y = 0 self.human_act_list_total = [] self.agent_act_list_total = [] self.interaction_counter = 0 self.iter_num = 0 self.run_num = 1 """ Initialize RL lists """ self.rewards_list = [] self.turn_list = [] self.interaction_time_list = [] self.interaction_training_time_list = [] self.mean_list = [] self.stdev_list = [] self.alpha_values = [] self.policy_loss_list = [] self.value_loss_list = [] self.q_loss_list = [] self.critics_lr_list = [] self.value_critic_lr_list = [] self.actor_lr_list = [] self.trials_list = [] """ Initialize turtle's monitoring lists """ self.turtle_pos_x = [] self.turtle_vel_x = [] self.turtle_acc_x = [] self.turtle_pos_y = [] self.turtle_vel_y = [] self.turtle_acc_y = [] self.turtle_pos_x_dict = {} """ Initialize Paths & Dirs """ rospack = rospkg.RosPack() package_path = rospack.get_path("collaborative_games") self.plot_directory = package_path + "/src/plots/" + control_mode + "/" if not os.path.exists(self.plot_directory): print("Dir %s was not found. Creating it..." % (self.plot_directory)) os.makedirs(self.plot_directory) exp_num = 1 while 1: if os.path.exists(self.plot_directory + "Experiment_" + str(exp_num) + "/"): exp_num += 1 else: self.plot_directory += "Experiment_" + str(exp_num) + "/" os.makedirs(self.plot_directory) break if not os.path.exists(self.plot_directory + 'rl_dynamics/'): print("Dir %s was not found. Creating it..." % (self.plot_directory + 'rl_dynamics/')) os.makedirs(self.plot_directory + 'rl_dynamics/')
def main(): game = Game() env_size = 4 m = MultiAgentCNN("pursuers", env_size) # target_m = MultiAgentCNN("target") ml_simulator = TfSimulator(4, m) #, target_m) epochs = 100 sim_per_epoch = 20 steps_per_run = 25 memory_size = 20000 subsample = 2 epsilon = 1.2 base_cap = 1 saver = 1000 run_dir = '{}_{}_{}_{}'.format(time(), epochs, sim_per_epoch, steps_per_run) print('Saving to {}'.format(run_dir)) #os.mkdir('animations/'+run_dir) decay = (1 - 6e-5) decayer = 1. loss_history = [] memory_actions = [] memory_states = [] memory_rewards = [] memory_end_states = [] total_steps_performed = 0 # Run epochs until 2.5M examples have been visited for radius in range(1, 4): for ep in range(epochs): decayer = 1. if total_steps_performed > 4000000: break states = [] rewards = [] actions = [] end_states = [] for sim in range(sim_per_epoch): # print("\tRun {}".format(sim)) n_bots = (sim % 3 + 2) % len(ml_simulator.env.bots) # Initialize randomly target pos = ml_simulator.env.rand_position() ml_simulator.env.rand_initialise_within_radius( radius, n_bots, pos, bot_on_radius=True) decayer *= decay base_cap0 = base_cap * decayer if len(memory_rewards) > memory_size / 2: epsilon = epsilon * decay # Baseline vs Model #if np.random.uniform(0, 1) < : if np.random.uniform(0, 1) < base_cap0: s, r, a, e_s = game.get_simulation_history( bot=0, N=steps_per_run, subsample=subsample) else: s, r, a, e_s = ml_simulator.run_simulation( 0, steps_per_run, subsample=subsample, epsilon=epsilon) if s is None or r is None or a is None: continue s = np.asarray(s) r = np.asarray(r) a = np.asarray(a) e_s = np.asarray(e_s) # Soft training try: statesTrain = np.concatenate(s, axis=0) rewardsTrain = np.concatenate(r, axis=0) actionsTrain = np.concatenate(a, axis=0) endStateTrain = np.concatenate(e_s, axis=0) if statesTrain.shape[1:] != (env_size, env_size, 3): statesTrain = statesTrain.transpose((0, 2, 3, 1)) if endStateTrain.shape[1:] != (env_size, env_size, 3): endStateTrain = endStateTrain.transpose((0, 2, 3, 1)) ml_simulator.model.train(statesTrain, rewardsTrain, actionsTrain, endStateTrain) # Expand memory DB rewards += [s] actions += [r] states += [s] end_states += [e_s] except Exception as e: # print("Error:", e) continue try: _ = np.concatenate(states, axis=0) _ = np.concatenate(rewards, axis=0) _ = np.concatenate(actions, axis=0) except Exception as e: print("Concat error:", e) continue n = len(rewards) total_steps_performed += n if len(memory_rewards) > memory_size: memory_rewards = memory_rewards[n * 2:] + rewards memory_actions = memory_actions[n * 2:] + actions memory_states = memory_states[n * 2:] + states memory_end_states = memory_end_states[n * 2:] + end_states else: memory_rewards = memory_rewards + rewards memory_actions = memory_actions + actions memory_states = memory_states + states memory_end_states = memory_end_states + end_states print("\t~ Building memory DB", len(memory_rewards)) try: statesTrain = np.concatenate(memory_states, axis=0) rewardsTrain = np.concatenate(memory_rewards, axis=0) actionsTrain = np.concatenate(memory_actions, axis=0) endStateTrain = np.concatenate(memory_end_states, axis=0) except Exception as e: print("Concat error:", e) continue env_shape = ml_simulator.env.verticeMatrix.shape while statesTrain.shape[1] != env_shape[0] or statesTrain.shape[ 2] != env_shape[1] or statesTrain.shape[3] != 3: statesTrain = statesTrain.transpose((0, 2, 3, 1)) while endStateTrain.shape[1] != env_shape[0] or endStateTrain.shape[ 2] != env_shape[1] or endStateTrain.shape[3] != 3: endStateTrain = 
endStateTrain.transpose((0, 2, 3, 1)) # Hard training save = (total_steps_performed > saver) if save: saver += 2500 print("\t- Memory Training...") loss = ml_simulator.model.train(statesTrain, rewardsTrain, actionsTrain, endStateTrain, save=save) print('\tLoss: {}'.format(loss)) if save: ml_simulator.run_and_plot(run_dir, ep, radius) loss_history.append(loss) m.save() ml_simulator.run_and_plot(run_dir, 'FINAL') plt.plot(np.arange(epochs), loss_history) plt.title("Loss over epochs") plt.xlabel("epoch") plt.ylabel("loss") plt.show()
def main(): game = Game() env_size = 4 m = MultiAgentCNN("pursuers", env_size) # target_m = MultiAgentCNN("target") ml_simulator = TfSimulator(4, m) #, target_m) eps_per_radius = 50 sim_per_epoch = 25 steps_per_run = 15 radius_top = 25 memory_size = 10000 subsample = 2 epsilon = 1. base_cap = 0.0 save_top = 2500 total_steps_performed = 0 run_dir = '{}_{}_{}'.format(time(), sim_per_epoch, steps_per_run) print('Saving to {}'.format(run_dir)) os.mkdir('animations/' + run_dir) decay = (1 - 5e-5) loss_history = [] memory_actions = [] memory_states = [] memory_rewards = [] memory_end_states = [] for radius in range(1, radius_top): for ep in range(eps_per_radius): print("EPOCH {} ---- Radius {}".format(ep, radius)) states = [] rewards = [] actions = [] end_states = [] for sim in range(sim_per_epoch): n_bots = (sim % (len(ml_simulator.env.bots) - 1) + 1) pos = ml_simulator.env.rand_position() ml_simulator.env.rand_initialise_within_radius( radius, n_bots, pos, bot_on_radius=True) base_cap *= decay if len(memory_rewards) > memory_size * 0.9: epsilon *= decay if np.random.uniform(0, 1) < base_cap: s, r, a, e_s = game.get_simulation_history(bot=0, N=steps_per_run, subsample=0) else: s, r, a, e_s = ml_simulator.run_simulation( 0, steps_per_run, subsample=subsample, epsilon=epsilon) if s is None or r is None or a is None: continue s = np.asarray(s) r = np.asarray(r) a = np.asarray(a) e_s = np.asarray(e_s) if s.shape[1:] != (env_size, env_size, 3): s = s.transpose((0, 2, 3, 1)) if e_s.shape[1:] != (env_size, env_size, 3): e_s = e_s.transpose((0, 2, 3, 1)) states.append(s) rewards.append(r) actions.append(a) end_states.append(e_s) # Soft train try: s_train = np.concatenate(states, axis=0) r_train = np.concatenate(rewards, axis=0) a_train = np.concatenate(actions, axis=0) e_s_train = np.concatenate(end_states, axis=0) except Exception as e: print("Soft", e) continue ml_simulator.model.train(s_train, r_train, a_train, e_s_train) n = len(r_train) total_steps_performed += n if len(memory_rewards) > memory_size: memory_rewards = memory_rewards[n * 2:] + rewards memory_actions = memory_actions[n * 2:] + actions memory_states = memory_states[n * 2:] + states memory_end_states = memory_end_states[n * 2:] + end_states else: memory_rewards = memory_rewards + rewards memory_actions = memory_actions + actions memory_states = memory_states + states memory_end_states = memory_end_states + end_states print("\t~ Building memory DB", len(memory_rewards)) # Hard Training try: statesTrain = np.concatenate(memory_states, axis=0) rewardsTrain = np.concatenate(memory_rewards, axis=0) actionsTrain = np.concatenate(memory_actions, axis=0) endStateTrain = np.concatenate(memory_end_states, axis=0) except Exception as e: print("Concat error:", e) continue env_shape = ml_simulator.env.verticeMatrix.shape while statesTrain.shape[1] != env_shape[0] or statesTrain.shape[ 2] != env_shape[1] or statesTrain.shape[3] != 3: statesTrain = statesTrain.transpose((0, 2, 3, 1)) while endStateTrain.shape[1] != env_shape[0] or endStateTrain.shape[2] != env_shape[1] or \ endStateTrain.shape[ 3] != 3: endStateTrain = endStateTrain.transpose((0, 2, 3, 1)) save = (total_steps_performed > save_top) print("\t- Memory Training...") loss = ml_simulator.model.train(statesTrain, rewardsTrain, actionsTrain, endStateTrain, save=save) print('\tLoss: {}'.format(loss)) loss_history.append(loss) if save: save_top += 2500 plot_history(loss_history) if save: ml_simulator.run_and_plot(run_dir, radius, radius) m.save() ml_simulator.run_and_plot(run_dir, 'FINAL') 
plot_history(loss_history)
def test_learner_against_human():
    # tests Q-learner against human player ("O")
    p1 = Player(strategy="basic_q", load_Q=True)
    p2 = Player(strategy="human")
    game = Game(p1=p1, p2=p2, verbose=True)
    game.play_game()
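# A hypothetical entry point tying together the tic-tac-toe helpers defined
# above (run_simulator, adversarial_training, test_learner_against_human,
# human_vs_human).  The mode names and game counts are illustrative
# assumptions, not the repo's actual CLI.
if __name__ == "__main__":
    import sys

    mode = sys.argv[1] if len(sys.argv) > 1 else "train"
    if mode == "train":
        # train a fresh Q-learner against a random opponent and save its table
        run_simulator(n_games=20000,
                      p1=Player(strategy="basic_q", learning=True),
                      p2=Player(strategy="random", learning=False),
                      save_Q=True)
    elif mode == "adversarial":
        # hunt for one losing line and drill it
        adversarial_training()
    elif mode == "human":
        # play against the trained AI yourself
        test_learner_against_human()
    else:
        human_vs_human()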
class controller: def __init__(self): # print("init") self.game = Game() # initialize a turtle game instance self.agent = SAC() # initialize a soft actor critic agent """ initialize human & agent actions """ self.action_human = 0.0 self.action_agent = 0.0 """ Create subscribers for human action act_human_sub_y is for the case that the agent's action is not used """ self.act_human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_human_action) self.act_human_sub_y = rospy.Subscriber("/rl/action_y", action_msg, self.set_human_action_y) """ Create publishers for turtle's acceleration, agent's action, robot reset signal and turtle's position on x axis """ self.turtle_accel_pub = rospy.Publisher("/rl/turtle_accel", Float32, queue_size=10) self.act_agent_pub = rospy.Publisher("/rl/action_y", action_msg, queue_size=10) self.reset_robot_pub = rospy.Publisher("/robot_reset", Int16, queue_size=1) self.turtle_state_pub = rospy.Publisher("/rl/turtle_pos_X", Int16, queue_size=10) """ Initialize statistics lists """ self.transmit_time_list = [] self.agent_act_list = [] self.human_act_list = [] self.action_timesteps = [] self.exec_time_list = [] self.act_human_list = [] self.real_act_list = [] self.time_real_act_list = [] self.time_turtle_pos = [] self.time_turtle_vel = [] self.time_turtle_acc = [] self.exec_time = None self.action_human_timestamp = 0 self.human_action_delay_list = [] self.used_human_act_list = [] self.used_human_act_time_list = [] self.position_timesteps = [] self.fps_list = [] self.action_human_y = 0 self.human_act_list_total = [] self.agent_act_list_total = [] self.interaction_counter = 0 self.iter_num = 0 self.run_num = 1 """ Initialize RL lists """ self.rewards_list = [] self.turn_list = [] self.interaction_time_list = [] self.interaction_training_time_list = [] self.mean_list = [] self.stdev_list = [] self.alpha_values = [] self.policy_loss_list = [] self.value_loss_list = [] self.q_loss_list = [] self.critics_lr_list = [] self.value_critic_lr_list = [] self.actor_lr_list = [] self.trials_list = [] """ Initialize turtle's monitoring lists """ self.turtle_pos_x = [] self.turtle_vel_x = [] self.turtle_acc_x = [] self.turtle_pos_y = [] self.turtle_vel_y = [] self.turtle_acc_y = [] self.turtle_pos_x_dict = {} """ Initialize Paths & Dirs """ rospack = rospkg.RosPack() package_path = rospack.get_path("collaborative_games") self.plot_directory = package_path + "/src/plots/" + control_mode + "/" if not os.path.exists(self.plot_directory): print("Dir %s was not found. Creating it..." % (self.plot_directory)) os.makedirs(self.plot_directory) exp_num = 1 while 1: if os.path.exists(self.plot_directory + "Experiment_" + str(exp_num) + "/"): exp_num += 1 else: self.plot_directory += "Experiment_" + str(exp_num) + "/" os.makedirs(self.plot_directory) break if not os.path.exists(self.plot_directory + 'rl_dynamics/'): print("Dir %s was not found. Creating it..." % (self.plot_directory + 'rl_dynamics/')) os.makedirs(self.plot_directory + 'rl_dynamics/') # if not os.path.exists(self.plot_directory + "turtle_dynamics/"): # print("Dir %s was not found. Creating it..." %(self.plot_directory + 'turtle_dynamics/')) # os.makedirs(self.plot_directory + 'turtle_dynamics/') def set_human_action(self, action_human): """ Gets the human action from the publisher. 
""" if action_human.action != 0.0: self.action_human = action_human.action self.action_human_timestamp = action_human.header.stamp.to_sec() self.time_real_act_list.append(self.action_human_timestamp) self.real_act_list.append(self.action_human) self.transmit_time_list.append(rospy.get_rostime().to_sec() - action_human.header.stamp.to_sec()) def set_human_action_y(self, action_human): """ Gets the human action from the publisher. """ if action_human.action != 0.0: self.action_human_y = action_human.action def game_loop(self): """ The main loop of the Collaborative Game. This loop contains the training and the testing for the collaborative game experiment """ first_update = True global_time = rospy.get_rostime().to_sec() # Run a testing session before starting training self.reset_lists() # score_list = self.test() # self.mean_list.append(mean(score_list)) # self.stdev_list.append(stdev(score_list)) # self.trials_list.append(score_list) # Run trials self.resetGame() self.interaction_counter = 0 for exp in range(MAX_STEPS + 1): self.check_goal_reached() if self.game.running: # checks if the game has not finished yet self.interaction_counter += 1 start_interaction_time = time.time() self.game.experiment = exp self.turns += 1 state = self.getState() # checks if agent or human controls the y axis if use_agent: agent_act = self.agent.next_action(state) else: agent_act = self.action_human_y self.turtle_state_pub.publish(state[2]) self.publish_agent_action(agent_act) tmp_time = time.time() act_human = self.action_human # self.action_human is changing while the loop is running human_act_timestamp = self.action_human_timestamp self.used_human_act_list.append(act_human) self.used_human_act_time_list.append(tmp_time) """This loop ensures that an action will be excexuted for the ACTION_DURATION""" count = 0 while ((time.time() - tmp_time) < action_duration) and self.game.running: count += 1 # control mode is "accel" or "vel" self.exec_time = self.game.play( [act_human, agent_act.item()], control_mode=control_mode) self.human_action_delay_list.append(time.time() - human_act_timestamp) # exec_time = self.game.play([act_human, 0],control_mode=control_mode) self.agent_act_list.append(agent_act.item()) self.human_act_list.append(act_human) self.action_timesteps.append(tmp_time) self.save_stats() self.check_goal_reached() # print count """ Retrive RLself.information.""" reward = self.getReward() next_state = self.getState() done = self.game.finished episode = [state, reward, agent_act, next_state, done] self.agent.update_rw_state(episode) self.total_reward_per_game += reward self.interaction_time_list.append(time.time() - start_interaction_time) # when replay buffer has enough samples update gradient at every turn if first_update and len( self.agent.D) >= BATCH_SIZE and exp > UPDATE_START: if first_update: print("\nStarting updates") first_update = False if use_agent: [ alpha, policy_loss, value_loss, q_loss, critics_lr, value_critic_lr, actor_lr ] = self.agent.train(sample=episode) self.save_rl_data(alpha, policy_loss, value_loss, q_loss, critics_lr, value_critic_lr, actor_lr) self.interaction_training_time_list.append( time.time() - start_interaction_time) # run "offline_updates_num" offline gradient updates every "UPDATE_INTERVAL" steps elif not first_update and len( self.agent.D ) >= BATCH_SIZE and exp % UPDATE_INTERVAL == 0: print(str(offline_updates_num) + " Gradient upadates") self.game.waitScreen("Training... 
Please Wait.") pbar = tqdm(xrange(1, offline_updates_num + 1), unit_scale=1, smoothing=0) for _ in pbar: if use_agent: [ alpha, policy_loss, value_loss, q_loss, critics_lr, value_critic_lr, actor_lr ] = self.agent.train(verbose=False) self.interaction_training_time_list.append( time.time() - start_interaction_time) self.save_rl_data(alpha, policy_loss, value_loss, q_loss, critics_lr, value_critic_lr, actor_lr) # # run testing trials self.reset_lists() score_list = self.test() self.mean_list.append(mean(score_list)) self.stdev_list.append(stdev(score_list)) self.trials_list.append(score_list) self.resetGame() else: #the game has finished self.save_and_plot_stats_environment(self.run_num) self.run_num += 1 self.turn_list.append(self.turns) self.rewards_list.append(self.total_reward_per_game) # reset game self.resetGame() self.save_and_plot_stats_rl() print( "Average Human Action Transmission time per interaction: %f milliseconds(stdev: %f). \n" % (mean(self.transmit_time_list) * 1e3, stdev(self.transmit_time_list) * 1e3)) print( "Average Execution time per interaction: %f milliseconds(stdev: %f). \n" % (mean(self.interaction_time_list) * 1e3, stdev(self.interaction_time_list) * 1e3)) if use_agent: print( "Average Execution time per interaction and online update: %f milliseconds(stdev: %f). \n" % (mean(self.interaction_training_time_list) * 1e3, stdev(self.interaction_training_time_list) * 1e3)) print("Total time of experiments is: %d minutes and %d seconds.\n" % ((rospy.get_rostime().to_sec() - global_time) / 60, (rospy.get_rostime().to_sec() - global_time) % 60)) self.game.endGame() def test(self): """ Performs the testing. The average value of the policy is used as the action and no sampling is being performed.""" score_list = [] self.iter_num += 1 for game in range(test_num): score = INTER_NUM self.resetGame("Testing Model. Trial %d of %d." 
% (game + 1, test_num)) self.interaction_counter = 0 while self.game.running: self.interaction_counter += 1 self.game.experiment = "Test: " + str(game + 1) state = self.getState() if use_agent: agent_act = self.agent.next_action( state, stochastic=False) # take only the mean else: agent_act = self.action_human_y act_human = self.action_human tmp_time = time.time() self.agent_act_list.append(agent_act.item()) self.human_act_list.append(act_human) self.action_timesteps.append(tmp_time) # print(agent_act) self.publish_agent_action(agent_act) while ((time.time() - tmp_time) < action_duration) and self.game.running: self.exec_time = self.game.play( [act_human, agent_act.item()], total_games=test_num, control_mode=control_mode) self.save_stats() self.check_goal_reached() score -= 1 self.check_goal_reached() score_list.append(score) self.save_and_plot_stats_environment("Test_" + str(self.iter_num) + "_" + str(game)) return score_list def save_stats(self): """Saves all the turtle-potition-relevant statistics for later analysis.""" self.turtle_accel_pub.publish(self.game.accel_x) self.turtle_pos_x.append(self.game.real_turtle_pos[0]) self.turtle_pos_y.append(self.game.real_turtle_pos[1]) self.time_turtle_pos.append(rospy.get_rostime().to_sec()) self.turtle_vel_x.append(self.game.vel_x) self.turtle_vel_y.append(self.game.vel_y) self.time_turtle_vel.append(rospy.get_rostime().to_sec()) self.turtle_acc_x.append(self.game.accel_x) self.turtle_acc_y.append(self.game.accel_y) self.time_turtle_acc.append(rospy.get_rostime().to_sec()) self.fps_list.append(self.game.current_fps) self.turtle_pos_x_dict[self.game.turtle_pos[0]] = self.exec_time self.exec_time_list.append(self.exec_time) def getReward(self): """Calculates the reward for the SAC agent""" if self.game.finished: if self.game.timedOut and REWARD_TYPE == "penalty": return -50 else: return 100 else: return -1 def getState(self): """Returns the state of the turtle game.""" return [ self.game.accel_x, self.game.accel_y, self.game.turtle_pos[0], self.game.turtle_pos[1], self.game.vel_x, self.game.vel_y ] def check_goal_reached(self, name="turtle"): """Checks if the goal of the turtle game has been reached.""" if name is "turtle": if self.game.time_dependend and ( self.interaction_counter % INTER_NUM == 0) and self.interaction_counter != 0: self.game.running = 0 self.game.exitcode = 1 self.game.timedOut = True self.game.finished = True self.interaction_counter = 0 # if self.game.width - 40 > self.game.turtle_pos[0] > self.game.width - (80 + 40) \ # and 20 < self.game.turtle_pos[1] < (80 + 60 / 2 - 32): # if (self.game.width - 160) <= self.game.turtle_pos[0] <= (self.game.width - 60 - 64) and 40 <= self.game.turtle_pos[1] <= (140 - 64): # self.game.running = 0 # self.game.exitcode = 1 # self.game.finished = True # This means final state achieved # self.interaction_counter = 0 if (self.game.width - 160) <= self.game.turtle_pos[0] + 32 <= ( self.game.width - 60) and 40 <= self.game.turtle_pos[1] + 32 <= 140: self.game.running = 0 self.game.exitcode = 1 self.game.finished = True # This means final state achieved self.interaction_counter = 0 def resetGame(self, msg=None): """Resets the Turtle Game.""" wait_time = 3 # self.reset_robot_pub.publish(1) self.game.waitScreen(msg1="Put Right Wrist on starting point.", msg2=msg, duration=wait_time) self.game = Game() self.TIME = ACTION_DURATION * INTER_NUM self.action_human = 0.0 self.game.start_time = time.time() self.total_reward_per_game = 0 self.turns = 0 # self.reset_robot_pub.publish(1) def 
save_and_plot_stats_rl(self): """Saves & plots all the rl-relevant statistics for later analysis.""" np.savetxt(self.plot_directory + 'rl_dynamics/' + 'alpha_values.csv', self.alpha_values, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'policy_loss.csv', self.policy_loss_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'value_loss.csv', self.value_loss_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'q_loss.csv', self.q_loss_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'rewards_list.csv', self.rewards_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'turn_list.csv', self.turn_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'means.csv', self.mean_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'stdev.csv', self.stdev_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'critics_lr_list.csv', self.critics_lr_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'value_critic_lr_list.csv', self.value_critic_lr_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'actor_lr_list.csv', self.actor_lr_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rl_dynamics/' + 'trials_list.csv', self.trials_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'agent_act_list_total.csv', self.agent_act_list_total, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'human_act_list_total.csv', self.human_act_list_total, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'human_action_delay_list.csv', self.human_action_delay_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'fps_list.csv', self.fps_list, delimiter=',', fmt='%f') plot_hist(self.fps_list, self.plot_directory, 'fps_list_' + str(self.game.fps)) np.savetxt(self.plot_directory + 'exec_time_list.csv', self.exec_time_list, delimiter=',', fmt='%f') plot(range(len(self.alpha_values)), self.alpha_values, "alpha_values", 'Alpha Value', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.policy_loss_list)), self.policy_loss_list, "policy_loss", 'Policy loss', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.value_loss_list)), self.value_loss_list, "value_loss_list", 'Value loss', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.rewards_list)), self.rewards_list, "Rewards_per_game", 'Total Rewards per Game', 'Number of Games', self.plot_directory, save=True) plot(range(len(self.turn_list)), self.turn_list, "Steps_per_game", 'Turns per Game', 'Number of Games', self.plot_directory, save=True) plot(range(len(self.critics_lr_list)), self.critics_lr_list, "critics_lr_list", 'Critic lr', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.value_critic_lr_list)), self.value_critic_lr_list, "value_critic_lr_list", 'Value lr', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.actor_lr_list)), self.actor_lr_list, "actor_lr_list", 'Actor lr', 'Number of Gradient Updates', self.plot_directory, save=True) try: plot(range(UPDATE_INTERVAL, MAX_STEPS + UPDATE_INTERVAL, UPDATE_INTERVAL), self.mean_list, "trials", 'Tests Score', 'Number of Interactions', self.plot_directory, save=True, variance=True, stdev=self.stdev_list) except: print("Trials did not ploted") 
plot_hist(self.agent_act_list_total, self.plot_directory + 'agent_act_hist_total', 'Agent Action Histogram') plot_hist(self.human_act_list_total, self.plot_directory + 'human_act_hist_total', 'Human Action Histogram') # human_action_delay_list_new = [elem for elem in self.human_action_delay_list if elem <0.8] # remove outliers caused by saving and plotting functions plot_hist(self.human_action_delay_list, self.plot_directory + 'human_action_delay_list', 'Human Action Delay Histogram') def save_and_plot_stats_environment(self, run_num): """Saves all environment-relevant statistics for later analysis.""" run_subfolder = "game_" + str(run_num) + "/" os.makedirs(self.plot_directory + run_subfolder + "turtle_dynamics/") self.exp_details() np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'agent_act_list.csv', self.agent_act_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'human_act_list.csv', self.human_act_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'action_timesteps.csv', self.action_timesteps, delimiter=',', fmt='%f') human_act_list = np.genfromtxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'human_act_list.csv', delimiter=',') agent_act_list = np.genfromtxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'agent_act_list.csv', delimiter=',') plot_hist( agent_act_list, self.plot_directory + run_subfolder + "turtle_dynamics/" + 'agent_act_hist_' "game_" + str(run_num), 'Agent Action Histogram') plot_hist( human_act_list, self.plot_directory + run_subfolder + "turtle_dynamics/" + 'human_act_hist_' "game_" + str(run_num), 'Human Action Histogram') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'turtle_pos_x.csv', self.turtle_pos_x, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'turtle_pos_y.csv', self.turtle_pos_y, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'time_turtle_pos.csv', self.time_turtle_pos, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'turtle_vel_x.csv', self.turtle_vel_x, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'turtle_vel_y.csv', self.turtle_vel_y, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'time_turtle_vel.csv', self.time_turtle_vel, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'turtle_accel_x.csv', self.turtle_acc_x, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'turtle_accel_y.csv', self.turtle_acc_y, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'time_turtle_acc.csv', self.time_turtle_acc, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'real_act_list.csv', self.real_act_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'time_real_act_list.csv', self.time_real_act_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'used_human_act_list.csv', self.used_human_act_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" + 'used_human_act_time_list.csv', self.used_human_act_time_list, delimiter=',', fmt='%f') 
subplot(self.plot_directory + run_subfolder + "turtle_dynamics/", self.turtle_pos_x, self.turtle_vel_x, self.turtle_acc_x, self.time_turtle_pos, self.time_turtle_vel, self.time_turtle_acc, human_act_list, self.action_timesteps, "x", control_mode) subplot(self.plot_directory + run_subfolder + "turtle_dynamics/", self.turtle_pos_y, self.turtle_vel_y, self.turtle_acc_y, self.time_turtle_pos, self.time_turtle_vel, self.time_turtle_acc, agent_act_list, self.action_timesteps, "y", control_mode) plt.figure("Real_Human_Actions_Comparison", figsize=(25, 10)) plt.grid() plt.ylabel('Human Actions') plt.xlabel('Msg Timestamp(seconds)') # plt.xticks(plt.xticks(np.arange(min(min(time_real_act_list),min(self.used_human_act_time_list)), max(max(time_real_act_list),max(used_human_act_time_list)), 1))) plt.scatter(self.time_real_act_list, self.real_act_list) plt.scatter(self.used_human_act_time_list, self.used_human_act_list) plt.savefig(self.plot_directory + run_subfolder + "human_real_action_comparison", dpi=150) self.reset_lists() def publish_agent_action(self, agent_act): """Publishes Agent's action.""" h = std_msgs.msg.Header() h.stamp = rospy.Time.now() act = action_msg() act.action = agent_act act.header = h self.act_agent_pub.publish(act) def reset_lists(self): """Reseting funstion for the list.""" self.human_act_list_total.extend(self.human_act_list) self.agent_act_list_total.extend(self.agent_act_list) self.agent_act_list = [] self.human_act_list = [] self.action_timesteps = [] self.turtle_pos_x = [] self.turtle_pos_y = [] self.time_turtle_pos = [] self.turtle_vel_x = [] self.turtle_vel_y = [] self.time_turtle_vel = [] self.turtle_acc_x = [] self.turtle_acc_y = [] self.time_turtle_acc = [] self.real_act_list = [] self.time_real_act_list = [] self.used_human_act_list = [] self.used_human_act_time_list = [] def save_rl_data(self, alpha, policy_loss, value_loss, q_loss, critics_lr, value_critic_lr, actor_lr): """RL data saver.""" self.alpha_values.append(alpha.item()) self.policy_loss_list.append(policy_loss.item()) self.value_loss_list.append(value_loss.item()) self.q_loss_list.append(q_loss.item()) self.critics_lr_list.append(critics_lr) self.value_critic_lr_list.append(value_critic_lr) self.actor_lr_list.append(actor_lr) def exp_details(self): """Keeps track of the experiment details & saves them for later reference at the experiment.""" exp_details = {} exp_details["MAX_STEPS"] = MAX_STEPS exp_details["CONTROL_MODE"] = CONTROL_MODE exp_details["ACCEL_RATE"] = ACCEL_RATE exp_details["ACTION_DURATION"] = ACTION_DURATION exp_details["BATCH_SIZE"] = BATCH_SIZE exp_details["REPLAY_SIZE"] = REPLAY_SIZE exp_details["OFFLINE_UPDATES"] = OFFLINE_UPDATES exp_details["UPDATE_START"] = UPDATE_START csv_file = "exp_details.csv" try: w = csv.writer(open(self.plot_directory + csv_file, "w")) for key, val in exp_details.items(): w.writerow([key, val]) except IOError: print("I/O error")
class controller:
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human = 0.0
        self.action_agent = 0.0
        # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
        # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
        self.agent_sub = rospy.Subscriber("/rl/final_action", action_agent,
                                          self.set_action_agent)
        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/reward_and_observation_game',
                                             reward_observation,
                                             queue_size=10, latch=True)
        self.transmit_time_list = []
        self.prev_state = self.game.getState()
        self.publish_reward_and_observations()

    # def set_action_human(self, action_human):
    #     self.action_human = action_human.action
    #     self.transmit_time_list.append(rospy.get_rostime().to_sec() - action_human.header.stamp.to_sec())

    def set_action_agent(self, action_agent):
        self.prev_state = self.game.getState()
        self.action_agent = action_agent.action[1]
        self.action_human = action_agent.action[0]
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       action_agent.header.stamp.to_sec())

    def publish_reward_and_observations(self):
        h = std_msgs.msg.Header()
        h.stamp = rospy.Time.now()
        new_obs = reward_observation()
        new_obs.header = h
        new_obs.observations = self.game.getState()
        new_obs.prev_state = self.prev_state
        new_obs.final_state = self.game.finished
        new_obs.reward = self.game.getReward()
        self.obs_robot_pub.publish(new_obs)

    def game_loop(self):
        total_time = []
        # print self.action_human
        for exp in range(self.experiments_num):
            print("Experiment %d" % (exp + 1))
            while self.game.running:
                exec_time = self.game.play(
                    [self.action_human, self.action_agent])
                total_time.append(exec_time)
                self.publish_reward_and_observations()
                # self.game.endGame()
            # publish last reward and state
            self.publish_reward_and_observations()
            # reset game
            self.game = Game()
            self.game.start_time = time.time()
        self.game.endGame()
def train(method, environment, resume, episodes, lr, lr_episodes, min_lr,
          eval_only, replay_width, batch_size, gamma, update_rate,
          save_interval):
    history = History(method + '_' + environment,
                      ['steps', 'avg_reward', 'loss'], resume is not None)
    history.flush()
    memory = ReplayMemory(replay_width)
    game = Game(name=environments_to_names[environment], memory=memory,
                render=False)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n

    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, episodes,
                      update_rate, step_size=lr_episodes, lr=lr,
                      save_interval=save_interval)

    # resume from a ckpt
    if resume is not None:
        agent.load(resume)

    avg_reward = MovingAverage(100)
    avg_loss = MovingAverage(100)

    log.info(f'Training for {episodes} episodes, starting ...')

    # main training loop
    for i in range(episodes):
        state = game.reset()
        done = False
        loss = None

        while not done:
            state = game.state
            action = agent.select_action(state)
            transition, done = game.step(int(action.to('cpu').numpy()))

            if len(memory) > batch_size:
                batched = memory.sample(batch_size)
                loss = agent.train(batched, batch_size, gamma, i)
                avg_loss.add(loss)

        reward = game.rewards
        # agent.save_best(reward)
        agent.save()
        agent.scheduler.step()
        avg_reward.add(reward)

        # moving averages
        text = [
            f'steps: {agent.step_cnt}',
            f'game epochs: {i}/{episodes}',
            f'train loss: {float(avg_loss):.5}',
            f'avg reward: {float(avg_reward):.5}',
            # f'best reward: {float(agent.best_reward):.5}',
            f'reward: {float(reward):.5}',
            f'epsilon: {agent.epsilon:.3}',
        ]
        log.info(', '.join(text), update=True)

        if agent.step_cnt % save_interval == 0:
            history.record({
                'steps': agent.step_cnt,
                'avg_reward': float(avg_reward),
                'loss': float(avg_loss),
            })

    game.env.close()
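# A hypothetical direct invocation of train() above.  The 'dqn'/'cartpole'
# keys and every hyperparameter value here are illustrative assumptions; the
# real agent_factory and environments_to_names keys are defined elsewhere in
# the repo (the functions look like they are normally wired to a click CLI,
# cf. the click.prompt call in play()).
if __name__ == '__main__':
    train(method='dqn', environment='cartpole', resume=None,
          episodes=1000, lr=1e-3, lr_episodes=200, min_lr=1e-5,
          eval_only=False, replay_width=10000, batch_size=32,
          gamma=0.99, update_rate=10, save_interval=1000)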
def build(self):
    experiment = Game()
    experiment.deal()
    return experiment
# -*- coding: utf-8 -*-
from environment import Game
from algorithm import QLearn


def play(algo, game, steps):
    for _ in range(steps):
        s = game.restart()
        a = algo.choose(s)
        algo.eligibility_trace *= 0
        game.view()
        while not game.over():
            s_, r = game.transition(a)
            a_ = algo.choose(s_)
            algo.update(s, a, s_, a_, r)
            s = s_
            a = a_
            # print algo.eligibility_trace
            game.view()
        # print _


actions = ['u', 'd', 'l', 'r']
algo = QLearn(actions=actions)
game = Game()
play(algo, game, 100)
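# A minimal, self-contained sketch of a tabular learner matching the interface
# the loop above relies on (choose / update / eligibility_trace, SARSA-style
# on-policy updates with an eligibility trace).  This is NOT the repo's QLearn
# from `algorithm`: the hyperparameters, the dict-backed trace (with __imul__
# so that `eligibility_trace *= 0` works), and the assumption that states are
# hashable are all illustrative choices.
import random


class TraceDict(dict):
    """(state, action) -> trace value; supports in-place scaling via `*=`."""

    def __imul__(self, factor):
        for key in self:
            self[key] *= factor
        return self


class SketchQLearn(object):
    def __init__(self, actions, alpha=0.1, gamma=0.9, epsilon=0.1, lam=0.9):
        self.actions = actions
        self.alpha, self.gamma, self.epsilon, self.lam = alpha, gamma, epsilon, lam
        self.q = {}                          # (state, action) -> value estimate
        self.eligibility_trace = TraceDict()

    def choose(self, s):
        # epsilon-greedy over the known action set
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.actions, key=lambda a: self.q.get((s, a), 0.0))

    def update(self, s, a, s_, a_, r):
        # SARSA(lambda): one TD error, credited to every traced pair
        delta = r + self.gamma * self.q.get((s_, a_), 0.0) - self.q.get((s, a), 0.0)
        self.eligibility_trace[(s, a)] = self.eligibility_trace.get((s, a), 0.0) + 1.0
        for key, trace in self.eligibility_trace.items():
            self.q[key] = self.q.get(key, 0.0) + self.alpha * delta * trace
            self.eligibility_trace[key] = trace * self.gamma * self.lam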
import torch
import os
import pickle
import numpy as np

from constants import *
from environment import Game
from model import Actor, Critic
from ppo import PPO
from utils import plot_data
from running_state import *
from replay_memory import *

env = Game()
n_input = env.state_dim
actor = Actor(n_input, N_HIDDEN)
critic = Critic(n_input, N_HIDDEN)

# retrieve previous saved model if exists
if os.path.exists(ACTOR_SAVE_PATH):
    print("Loading saved actor model...")
    actor.load_state_dict(torch.load(ACTOR_SAVE_PATH))
if os.path.exists(CRITIC_SAVE_PATH):
    print("Loading saved critic model...")
    critic.load_state_dict(torch.load(CRITIC_SAVE_PATH))

ppo_agent = PPO(env, actor, critic)
running_state = ZFilter((2, ), clip=5)
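# A hypothetical rollout helper showing how the ZFilter above is typically
# used: every raw observation is passed through running_state() so the actor
# sees mean/std-normalised, clipped states.  The env.reset() /
# env.sample_action() / env.take_action() method names are borrowed from the
# DDPG snippet elsewhere in this collection and are assumptions here, as is
# ZFilter being callable; the PPO agent's own API is deliberately not used.
def collect_episode(max_steps=500):
    states, actions, rewards = [], [], []
    S = running_state(env.reset())          # normalise the initial state
    for _ in range(max_steps):
        A = env.sample_action()             # placeholder policy for this sketch
        S_prime, R, is_done = env.take_action(A)
        states.append(S)
        actions.append(A)
        rewards.append(R)
        S = running_state(S_prime)          # normalise the successor state
        if is_done:
            break
    return states, actions, rewards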
        loss = F.nll_loss(action_r,
                          tocuda(Variable(torch.LongTensor(labels)), cuda))
        loss.backward()
        optim_r.step()
    else:
        raise ValueError('Invalid parameter')

    for i in range(n_games):
        game_pool[i].reset()

    return sum(reward) / 32.


n_hidden = 200
n_games = args.batch
game_pool = []
for i in range(n_games):
    game_pool.append(Game(n_numbers, n_k))

R = Receiver(n_numbers, n_k, n_hidden)
SA = SenderActor(n_numbers, n_k, n_hidden)
SC = SenderCritic(n_numbers, n_k, n_hidden)
if cuda:
    R.cuda()
    SA.cuda()
    SC.cuda()

optim_r = optim.Adam(R.parameters(), lr=1e-3)
optim_sa = optim.Adam(SA.parameters(), lr=1e-3)
optim_sc = optim.Adam(SC.parameters(), lr=1e-3)

running_succ_rate = 1. / n_k

for epoch in count(1):
import torch
import pickle
import os

from environment import Game
from replay_mem import ReplayMemory
from ddpg import DDPG
from constants import *

# game environment
env = Game()

# Create replay memory
memory = ReplayMemory(MAX_BUFF_SIZE, env.state_dim, env.n_actions)

# DDPG agent
agent = DDPG(env, memory)


def initialize_replay_mem():
    '''
    Initialize the replay memory.
    '''
    print("Initializing replay memory...")
    S = env.reset()
    for _ in range(MAX_BUFF_SIZE):
        A = env.sample_action()
        S_prime, R, is_done = env.take_action(A)
        memory.add_to_memory((S, A, S_prime, R, is_done))
        if is_done:
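            # Assumed completion of the excerpt above, which is cut off at the
            # `if is_done:` branch: the usual warm-up pattern is to reset the
            # environment when an episode ends and otherwise continue from the
            # successor state.  Hypothetical lines, not the repo's own code.
            S = env.reset()
        else:
            S = S_prime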
class controller: def __init__(self): print("init") self.experiments_num = 10 # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate) self.game = Game() self.action_human = 0.0 self.action_agent = 0.0 # # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human) # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human) self.agent_sub = rospy.Subscriber("/rl/total_action", action_agent, self.set_action_agent) # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10) self.obs_robot_pub = rospy.Publisher('/rl/game_response', reward_observation, queue_size=10, latch=True) self.transmit_time_list = [] self.rewards_list = [] self.turn_list = [] self.interaction_time_list = [] self.interaction_training_time_list = [] self.mean_list = [] self.stdev_list = [] self.alpha_values = [] self.policy_loss_list = [] self.value_loss_list = [] self.critics_lr_list = [] self.value_critic_lr_list = [] self.actor_lr_list = [] self.plot_directory = package_path + "/src/plots/" if not os.path.exists(self.plot_directory): print("Dir %s was not found. Creating it..." % (self.plot_directory)) os.makedirs(self.plot_directory) def set_action_agent(self, action_agent): self.prev_state = self.game.getState() self.action_agent = action_agent.action[1] self.action_human = action_agent.action[0] self.transmit_time_list.append(rospy.get_rostime().to_sec() - action_agent.header.stamp.to_sec()) def set_human_agent(self, action_agent): if action_agent.action != 0.0: self.action_human = action_agent.action self.transmit_time_list.append(rospy.get_rostime().to_sec() - action_agent.header.stamp.to_sec()) def publish_reward_and_observations(self): h = std_msgs.msg.Header() h.stamp = rospy.Time.now() new_obs = reward_observation() new_obs.header = h new_obs.observations = self.game.getState() new_obs.prev_state = self.prev_state new_obs.final_state = self.game.finished new_obs.reward = self.game.getReward() self.obs_robot_pub.publish(new_obs) def game_loop(self): first_update = True global_time = rospy.get_rostime().to_sec() self.resetGame() for exp in range(MAX_STEPS + 1): if self.game.running: start_interaction_time = time.time() self.game.experiment = exp self.turns += 1 state = self.game.getState() agent_act = self.agent.next_action(state) tmp_time = time.time() act_human = self.action_human while time.time() - tmp_time < action_duration: exec_time = self.game.play([act_human, agent_act.item()]) reward = self.game.getReward() next_state = self.game.getState() done = self.game.finished episode = [state, reward, agent_act, next_state, done] self.agent.update_rw_state(episode) self.total_reward_per_game += reward self.interaction_time_list.append(time.time() - start_interaction_time) # when replay buffer has enough samples update gradient at every turn if len(self.agent.D) >= BATCH_SIZE: if first_update: print("\nStarting updates") first_update = False [ alpha, policy_loss, value_loss, critics_lr, value_critic_lr, actor_lr ] = self.agent.train(sample=episode) self.alpha_values.append(alpha.item()) self.policy_loss_list.append(policy_loss.item()) self.value_loss_list.append(value_loss.item()) self.critics_lr_list.append(critics_lr) self.value_critic_lr_list.append(value_critic_lr) self.actor_lr_list.append(actor_lr) self.interaction_training_time_list.append( time.time() - start_interaction_time) # run "offline_updates_num" offline gradient updates every "UPDATE_INTERVAL" steps if len(self.agent.D ) >= BATCH_SIZE and exp % 
UPDATE_INTERVAL == 0: print(str(offline_updates_num) + " Gradient upadates") self.game.waitScreen("Training... Please Wait.") pbar = tqdm(xrange(1, offline_updates_num + 1), unit_scale=1, smoothing=0) for _ in pbar: [ alpha, policy_loss, value_loss, critics_lr, value_critic_lr, actor_lr ] = self.agent.train(verbose=False) self.alpha_values.append(alpha.item()) self.policy_loss_list.append(policy_loss.item()) self.value_loss_list.append(value_loss.item()) self.critics_lr_list.append(critics_lr) self.value_critic_lr_list.append(value_critic_lr) self.actor_lr_list.append(actor_lr) # run trials mean_score, stdev_score = self.test() self.mean_list.append(mean_score) self.stdev_list.append(stdev_score) self.resetGame() else: self.turn_list.append(self.turns) self.rewards_list.append(self.total_reward_per_game) # reset game self.resetGame() self.save_and_plot_stats() print( "Average Execution time per interaction: %f milliseconds(stdev: %f). \n" % (mean(self.interaction_time_list) * 1e3, stdev(self.interaction_time_list) * 1e3)) print( "Average Execution time per interaction and online update: %f milliseconds(stdev: %f). \n" % (mean(self.interaction_training_time_list) * 1e3, stdev(self.interaction_training_time_list) * 1e3)) print("Total time of experiments is: %d minutes and %d seconds.\n" % ((rospy.get_rostime().to_sec() - global_time) / 60, (rospy.get_rostime().to_sec() - global_time) % 60)) self.game.endGame() def test(self): score_list = [] for game in range(test_num): score = 200 self.resetGame("Testing Model. Trial %d of %d." % (game + 1, test_num)) while self.game.running: self.game.experiment = "Test: " + str(game + 1) state = self.game.getState() agent_act = self.agent.next_action( state, stochastic=False) # take only the mean # print(agent_act) tmp_time = time.time() while time.time() - tmp_time < 0.2: exec_time = self.game.play( [self.action_human, agent_act.item()], total_games=test_num) score -= 1 score_list.append(score) return [mean(score_list), stdev(score_list)] def resetGame(self, msg=None): wait_time = 3 self.game.waitScreen(msg1="Put Right Wrist on starting point.", msg2=msg, duration=wait_time) self.game = Game() self.action_human = 0.0 self.action_agent = 0.0 self.game.start_time = time.time() self.total_reward_per_game = 0 self.turns = 0 def save_and_plot_stats(self): np.savetxt(self.plot_directory + 'alpha_values.csv', self.alpha_values, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'policy_loss.csv', self.policy_loss_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'value_loss.csv', self.value_loss_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'rewards_list.csv', self.rewards_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'turn_list.csv', self.turn_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'means.csv', self.mean_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'stdev.csv', self.stdev_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'critics_lr_list.csv', self.critics_lr_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'value_critic_lr_list.csv', self.value_critic_lr_list, delimiter=',', fmt='%f') np.savetxt(self.plot_directory + 'actor_lr_list.csv', self.actor_lr_list, delimiter=',', fmt='%f') plot(range(len(self.alpha_values)), self.alpha_values, "alpha_values", 'Alpha Value', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.policy_loss_list)), self.policy_loss_list, "policy_loss", 'Policy loss', 'Number of 
Gradient Updates', self.plot_directory, save=True) plot(range(len(self.value_loss_list)), self.value_loss_list, "value_loss_list", 'Value loss', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.rewards_list)), self.rewards_list, "Rewards_per_game", 'Total Rewards per Game', 'Number of Games', self.plot_directory, save=True) plot(range(len(self.turn_list)), self.turn_list, "Steps_per_game", 'Turns per Game', 'Number of Games', self.plot_directory, save=True) plot(range(len(self.critics_lr_list)), self.critics_lr_list, "critics_lr_list", 'Critic lr', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.value_critic_lr_list)), self.value_critic_lr_list, "value_critic_lr_list", 'Value lr', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(len(self.actor_lr_list)), self.actor_lr_list, "actor_lr_list", 'Actor lr', 'Number of Gradient Updates', self.plot_directory, save=True) plot(range(UPDATE_INTERVAL, MAX_STEPS + UPDATE_INTERVAL, UPDATE_INTERVAL), self.mean_list, "trials", 'Tests Score', 'Number of Interactions', self.plot_directory, save=True, variance=True, stdev=self.stdev_list)