Example no. 1
def autoplay(method, environment, resume, render):
    game = Game(name=environments_to_names[environment], render=render)

    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n
    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape, n_actions, environment, 1, 1, eps_start=0.0)
    agent.load(resume)

    log.info(f'Evaluating agent, loaded from {resume}, starting ...')

    game.reset()
    for t in count():
        state = game.get_state()
        action = agent.select_action(state)
        transition, done = game.step(int(action.cpu().numpy()))
        # agent.eval(
        #     transition, 1, 0.0)
        time.sleep(0.1)
        if done:
            log.info(f'agent survived {t} steps')
            game.reset()
            break

    game.env.close()
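
A hypothetical invocation of the autoplay snippet above; the method key, environment name, and checkpoint path are placeholders and assume matching entries exist in agent_factory and environments_to_names:

# Hypothetical call: replay a saved agent with rendering enabled.
# "dqn", "cartpole" and the checkpoint path are illustrative placeholders.
autoplay(method="dqn", environment="cartpole",
         resume="checkpoints/agent.pt", render=True)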
Example no. 2
 def teachAgent(self):
     iteration = 0
     while iteration < self.numOfEpisodes:
         game = Game(self.agent)
         game.start(training=True)
         iteration += 1
         if iteration % 10000 == 0:
             print("Training round: " + str(iteration))
     plot_agent_reward(self.agent.rewards)
Example no. 3
 def resetGame(self, msg=None):
     wait_time = 3
     self.game.waitScreen(msg1="Put Right Wrist on starting point.",
                          msg2=msg,
                          duration=wait_time)
     self.game = Game()
     self.action_human = 0.0
     self.action_agent = 0.0
     self.game.start_time = time.time()
     self.total_reward_per_game = 0
     self.turns = 0
Example no. 4
def play(environment):
    game = Game(name=environments_to_names[environment], render=True)
    done = False
    try:
        while not done:
            action = click.prompt('Please enter an action (0, 1, 2, 3..)')
            done = game.step(int(action))
        log.info('[INFO] done ...')
    except KeyboardInterrupt:
        log.info("[INFO] quiting ...")
        exit()
Example no. 5
 def userPlayAgent(self):
     while True:
         game = Game(self.agent)
         game.start()
         playAgain = input("Would you like to play again? ('y', 'n'): ")
         if playAgain == 'n':
             print("See you later!")
             break
         print()
         print("Okay lets play again!")
         print()
Example no. 6
class controller:
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human1 = 0.0
        self.action_human2 = 0.0
        self.human_sub = rospy.Subscriber("/rl/action_x", action_human,
                                          self.set_action_human1)
        self.human_sub = rospy.Subscriber("/rl/action_y", action_human,
                                          self.set_action_human2)

        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/reward_observation_robot',
                                             reward_observation,
                                             queue_size=10,
                                             latch=True)
        self.transmit_time_list = []

    def set_action_human1(self, act):
        self.action_human1 = act.action
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       act.header.stamp.to_sec())

    def set_action_human2(self, act):
        self.action_human2 = act.action
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       act.header.stamp.to_sec())

    def game_loop(self):
        total_time = []
        # print self.action_human
        for exp in range(self.experiments_num):
            print("Experiment %d" % exp)
            while self.game.running:
                exec_time = self.game.play(
                    [self.action_human1, self.action_human2])
                total_time.append(exec_time)

            # reset game
            self.game = Game()
            self.game.start_time = time.time()

        self.game.endGame()

        print(
            "Average execution time for the play function is %f milliseconds. \n" %
            (mean(total_time) * 1e3))
        print(
            "Average time from publishing to receiving is %f milliseconds. \n"
            % (mean(self.transmit_time_list) * 1e3))
Example no. 7
 def resetGame(self, msg=None):
     """Resets the Turtle Game."""
     wait_time = 3
     # self.reset_robot_pub.publish(1)
     self.game.waitScreen(msg1="Put Right Wrist on starting point.",
                          msg2=msg,
                          duration=wait_time)
     self.game = Game()
     self.TIME = ACTION_DURATION * INTER_NUM
     self.action_human = 0.0
     self.game.start_time = time.time()
     self.total_reward_per_game = 0
     self.turns = 0
Example no. 9
    def game_loop(self):
        total_time = []
        # print self.action_human
        for exp in range(self.experiments_num):
            print("Experiment %d" % (exp + 1))
            while self.game.running:
                exec_time = self.game.play(
                    [self.action_human, self.action_agent])
                total_time.append(exec_time)
                self.publish_reward_and_observations()

            # self.game.endGame()

            # publish last reward and state
            self.publish_reward_and_observations()

            # reset game
            self.game = Game()
            self.game.start_time = time.time()

        self.game.endGame()
Example no. 10
def run_simulator(
        n_games=1000,
        p1=Player(strategy="basic_q", learning=True),
        p2=Player(strategy="random", learning=False),
        save_Q=False,
):
    """Runs a number of tic tac toe games
	
	Arguments:
		n_games: number of tic tac toe games to be played
		p1: instance of the Player() class, should be used to specify the AI
			during training or testing
		p2: instance of the Player() class, should be used to specify the
			opponent
		save_Q: boolean, determines if p1's updated Q should be saved in a
			pickled file after training is complete

	Returns:
		metrics: dict of P1's wins, losses, and ties

	Use this function to run many games in a row. Depending on the 
	parameters for P1 and P2, this could be used for training or testing.
	This function also specifies the epsilon (exploration factor) decay
	over time as the learner moves from high exploration to low.
	"""
    games = [Game(p1=p1, p2=p2) for i in range(n_games)]
    metrics = []
    index = 0
    starting_epsilon = p1._epsilon

    for game in games:
        index += 1

        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()

        # log game results
        metrics.append(outcome)

        # update q learner after each game
        if p1._learning:
            p1.update_q(outcome, x_decisions)

            # reduce exploration factor after each game
            p1._epsilon = starting_epsilon - starting_epsilon * (1. * index /
                                                                 n_games)**2

    print(Counter(metrics))

    # save pickled Q learning file
    if save_Q:
        pickle.dump(p1._Q, open(p1.q_file, "wb"))
    return metrics
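
Inside the loop above, the exploration factor decays quadratically, epsilon_i = epsilon_0 * (1 - (i / n_games)**2), so it reaches zero on the final game. A minimal usage sketch follows, assuming the Player keyword arguments shown in these examples (strategy, learning, load_Q); the game counts are illustrative:

# Hypothetical training-then-evaluation driver for run_simulator.
trainee = Player(strategy="basic_q", learning=True)
run_simulator(n_games=20000, p1=trainee, save_Q=True)  # train and persist Q
test_metrics = run_simulator(                          # evaluate the saved Q
    n_games=1000,
    p1=Player(strategy="basic_q", learning=False, load_Q=True),
    p2=Player(strategy="random", learning=False))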
Example no. 11
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human = 0.0
        self.action_agent = 0.0
        # # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
        # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
        self.agent_sub = rospy.Subscriber("/rl/final_action", action_agent,
                                          self.set_action_agent)

        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/reward_and_observation_game',
                                             reward_observation,
                                             queue_size=10,
                                             latch=True)

        self.transmit_time_list = []
        self.prev_state = self.game.getState()

        self.publish_reward_and_observations()
Example no. 12
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human = 0.0
        self.action_agent = 0.0
        # # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
        # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
        self.agent_sub = rospy.Subscriber("/rl/total_action", action_agent,
                                          self.set_action_agent)

        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/game_response',
                                             reward_observation,
                                             queue_size=10,
                                             latch=True)

        self.transmit_time_list = []

        self.rewards_list = []
        self.turn_list = []
        self.interaction_time_list = []
        self.interaction_training_time_list = []
        self.mean_list = []
        self.stdev_list = []
        self.alpha_values = []
        self.policy_loss_list = []
        self.value_loss_list = []
        self.critics_lr_list = []
        self.value_critic_lr_list = []
        self.actor_lr_list = []

        self.plot_directory = package_path + "/src/plots/"
        if not os.path.exists(self.plot_directory):
            print("Dir %s was not found. Creating it..." %
                  (self.plot_directory))
            os.makedirs(self.plot_directory)
Example no. 14
def main():
    env = Game(fps=fps,
               screensize=screen,
               state=state,
               playersize=player_size,
               playercolor=player_color,
               enemysize=enemy_size,
               enemycolor=enemy_color,
               squaresize=square_size,
               squarecolor=square_color)
    get_ann = {
        "clever": build_ann,
        "forged": forge_ann,
        "keras": keras_ann
    }.get(agent_type, lambda *args: None)

    # the agent name was already taken by the agent module :(
    actor = get_agent(env=env, get_network=get_ann)
    env.reset(actor)
    env.mainloop()
Example no. 15
def adversarial_training(
        p1=Player(strategy="basic_q", learning=False, load_Q=True),
        p2=Player(strategy="random", learning=False),
        p3=Player(strategy="basic_q", learning=True, load_Q=True),
        p4=Player(strategy="adversarial", learning=False),
):
    """Finds strategies that beat the AI for targeted training

	Arguments:
		p1: trained Q-learner in "test mode" (no learning)
		p2: cpu player choosing random moves
		p3: trained Q-leaner that will continue to train
		p4: p3's opponent, uses saved strategy uncovered by p2 that beat p1

	Plays a trained AI against a random player until (if) the random player
	wins. If the random player does win, it will save that strategy in its
	own Q file. It will then play this adversarial scenario many times so
	that the AI can learn a better strategy.

	Training is very specific and deep (many repetitions), so this is not
	meant to be a general training strategy and is best used after the AI
	is already sufficiently robust. Each time this function runs will only
	cover one adversarial example, so it may need to be run many times.
	"""

    # phase 1: AI vs. random opponent
    max_games = 50000
    games = [Game(p1=p1, p2=p2) for i in range(max_games)]

    for game in games:

        # run game and get results + P1's states and actions
        outcome, x_decisions, o_decisions = game.play_game()

        # build a Q network for player O only where X lost
        if outcome == "lost":

            # phase 2: adversarial training
            print "AI lost. Playing adversarial games..."
            Q_adversarial = {board: action for board, action in o_decisions}
            num_games = 10000
            a_games = [Game(
                p1=p3,
                p2=p4,
            ) for i in range(num_games)]

            starting_epsilon = p3._epsilon

            a_index = 0
            a_metrics = []
            for game in a_games:
                p4._Q = Q_adversarial
                a_index += 1
                outcome, x_decisions, o_decisions = game.play_game()
                a_metrics.append(outcome)
                p3.update_q(outcome, x_decisions)
                p3._epsilon = starting_epsilon - starting_epsilon * (
                    1. * a_index / num_games)**2

            print "Adversarial game outcomes:\n"
            print Counter(a_metrics)

            print "Building better, stronger Q..."
            pickle.dump(p3._Q, open(p3.q_file, "wb"))

            # end training
            return
    print "played %d games without losing!" % max_games
Example no. 16
def human_vs_human():
    Game(p1=Player(strategy="human"),
         p2=Player(strategy="human"),
         verbose=True).play_game()
Example no. 17
 def new_game(self):
     return Game(self.action_space_size, self.discount)
Example no. 18
# def rgb2gray(rgb):
#     r, g, b = rgb[:, :, 0], rgb[:, :, 1], rgb[:, :, 2]
#     return 0.2989 * r + 0.5870 * g + 0.1140 * b

# def shapeState(img):
#     # resize image and make grayscale
#     resized_gray = rgb2gray(scipy.misc.imresize(img, (64, 64))) / 255.0
#     shaped = resized_gray.reshape(1, 64, 64, 1)
#     return shaped


def shapeState(img):
    return img.reshape(1, 64, 64, 1)
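
A sketch of the grayscale-and-resize preprocessing that the commented-out shapeState above performed with the now-deprecated scipy.misc.imresize, rewritten here with Pillow; it assumes img is an H x W x 3 uint8 RGB array:

import numpy as np
from PIL import Image


def shape_state_gray(img, size=(64, 64)):
    # Luminance-weighted grayscale, as in the commented-out rgb2gray above.
    gray = 0.2989 * img[:, :, 0] + 0.5870 * img[:, :, 1] + 0.1140 * img[:, :, 2]
    # Resize with Pillow and rescale to [0, 1].
    resized = np.asarray(Image.fromarray(gray.astype(np.uint8)).resize(size)) / 255.0
    return resized.reshape(1, size[0], size[1], 1)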


# env = gym.make('CartPole-v1')
env = Game()
next_state, reward, done = env.render()

# feed 64 by 64 grayscale images into CNN
state_size = (64, 64)
action_size = len(env.keylist)
agent = Player(state_size, action_size)
done = False
batch_size = 32

max_score = 0
frames = []
best = []

print('get game window in focus')
for i in list(range(4))[::-1]:
Example no. 20
def main():
    game = Game()
    env_size = 4
    m = MultiAgentCNN("pursuers", env_size)
    # target_m = MultiAgentCNN("target")
    ml_simulator = TfSimulator(4, m)  #, target_m)

    epochs = 100
    sim_per_epoch = 20
    steps_per_run = 25

    memory_size = 20000

    subsample = 2

    epsilon = 1.2
    base_cap = 1

    saver = 1000

    run_dir = '{}_{}_{}_{}'.format(time(), epochs, sim_per_epoch,
                                   steps_per_run)
    print('Saving to {}'.format(run_dir))
    #os.mkdir('animations/'+run_dir)

    decay = (1 - 6e-5)
    decayer = 1.

    loss_history = []

    memory_actions = []
    memory_states = []
    memory_rewards = []
    memory_end_states = []

    total_steps_performed = 0

    # Run epochs until 4M examples have been visited (see the break below)

    for radius in range(1, 4):
        for ep in range(epochs):
            decayer = 1.
            if total_steps_performed > 4000000:
                break
            states = []
            rewards = []
            actions = []
            end_states = []

            for sim in range(sim_per_epoch):
                # print("\tRun {}".format(sim))
                n_bots = (sim % 3 + 2) % len(ml_simulator.env.bots)

                # Initialize randomly target
                pos = ml_simulator.env.rand_position()
                ml_simulator.env.rand_initialise_within_radius(
                    radius, n_bots, pos, bot_on_radius=True)

                decayer *= decay
                base_cap0 = base_cap * decayer

                if len(memory_rewards) > memory_size / 2:
                    epsilon = epsilon * decay

                # Baseline vs Model
                #if np.random.uniform(0, 1) < :
                if np.random.uniform(0, 1) < base_cap0:
                    s, r, a, e_s = game.get_simulation_history(
                        bot=0, N=steps_per_run, subsample=subsample)
                else:
                    s, r, a, e_s = ml_simulator.run_simulation(
                        0, steps_per_run, subsample=subsample, epsilon=epsilon)

                if s is None or r is None or a is None:
                    continue

                s = np.asarray(s)
                r = np.asarray(r)
                a = np.asarray(a)
                e_s = np.asarray(e_s)

                # Soft training
                try:
                    statesTrain = np.concatenate(s, axis=0)
                    rewardsTrain = np.concatenate(r, axis=0)
                    actionsTrain = np.concatenate(a, axis=0)
                    endStateTrain = np.concatenate(e_s, axis=0)

                    if statesTrain.shape[1:] != (env_size, env_size, 3):
                        statesTrain = statesTrain.transpose((0, 2, 3, 1))
                    if endStateTrain.shape[1:] != (env_size, env_size, 3):
                        endStateTrain = endStateTrain.transpose((0, 2, 3, 1))

                    ml_simulator.model.train(statesTrain, rewardsTrain,
                                             actionsTrain, endStateTrain)

                    # Expand memory DB
                    states += [s]
                    rewards += [r]
                    actions += [a]
                    end_states += [e_s]
                except Exception as e:
                    # print("Error:", e)
                    continue

            try:
                _ = np.concatenate(states, axis=0)
                _ = np.concatenate(rewards, axis=0)
                _ = np.concatenate(actions, axis=0)
            except Exception as e:
                print("Concat error:", e)
                continue

            n = len(rewards)
            total_steps_performed += n

            if len(memory_rewards) > memory_size:
                memory_rewards = memory_rewards[n * 2:] + rewards
                memory_actions = memory_actions[n * 2:] + actions
                memory_states = memory_states[n * 2:] + states
                memory_end_states = memory_end_states[n * 2:] + end_states
            else:
                memory_rewards = memory_rewards + rewards
                memory_actions = memory_actions + actions
                memory_states = memory_states + states
                memory_end_states = memory_end_states + end_states

                print("\t~ Building memory DB", len(memory_rewards))

            try:
                statesTrain = np.concatenate(memory_states, axis=0)
                rewardsTrain = np.concatenate(memory_rewards, axis=0)
                actionsTrain = np.concatenate(memory_actions, axis=0)
                endStateTrain = np.concatenate(memory_end_states, axis=0)
            except Exception as e:
                print("Concat error:", e)
                continue

            env_shape = ml_simulator.env.verticeMatrix.shape
            while statesTrain.shape[1] != env_shape[0] or statesTrain.shape[
                    2] != env_shape[1] or statesTrain.shape[3] != 3:
                statesTrain = statesTrain.transpose((0, 2, 3, 1))

            while endStateTrain.shape[1] != env_shape[0] or endStateTrain.shape[
                    2] != env_shape[1] or endStateTrain.shape[3] != 3:
                endStateTrain = endStateTrain.transpose((0, 2, 3, 1))

            # Hard training
            save = (total_steps_performed > saver)
            if save:
                saver += 2500

            print("\t- Memory Training...")
            loss = ml_simulator.model.train(statesTrain,
                                            rewardsTrain,
                                            actionsTrain,
                                            endStateTrain,
                                            save=save)
            print('\tLoss: {}'.format(loss))
            if save:
                ml_simulator.run_and_plot(run_dir, ep, radius)
            loss_history.append(loss)

    m.save()
    ml_simulator.run_and_plot(run_dir, 'FINAL')

    plt.plot(np.arange(len(loss_history)), loss_history)
    plt.title("Loss over epochs")
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.show()
Example no. 21
def main():
    game = Game()
    env_size = 4
    m = MultiAgentCNN("pursuers", env_size)
    # target_m = MultiAgentCNN("target")
    ml_simulator = TfSimulator(4, m)  #, target_m)

    eps_per_radius = 50
    sim_per_epoch = 25
    steps_per_run = 15
    radius_top = 25

    memory_size = 10000

    subsample = 2
    epsilon = 1.
    base_cap = 0.0

    save_top = 2500
    total_steps_performed = 0

    run_dir = '{}_{}_{}'.format(time(), sim_per_epoch, steps_per_run)
    print('Saving to {}'.format(run_dir))
    os.mkdir('animations/' + run_dir)

    decay = (1 - 5e-5)

    loss_history = []

    memory_actions = []
    memory_states = []
    memory_rewards = []
    memory_end_states = []

    for radius in range(1, radius_top):
        for ep in range(eps_per_radius):
            print("EPOCH {} ---- Radius {}".format(ep, radius))

            states = []
            rewards = []
            actions = []
            end_states = []

            for sim in range(sim_per_epoch):
                n_bots = (sim % (len(ml_simulator.env.bots) - 1) + 1)
                pos = ml_simulator.env.rand_position()
                ml_simulator.env.rand_initialise_within_radius(
                    radius, n_bots, pos, bot_on_radius=True)

                base_cap *= decay
                if len(memory_rewards) > memory_size * 0.9:
                    epsilon *= decay

                if np.random.uniform(0, 1) < base_cap:
                    s, r, a, e_s = game.get_simulation_history(bot=0,
                                                               N=steps_per_run,
                                                               subsample=0)
                else:
                    s, r, a, e_s = ml_simulator.run_simulation(
                        0, steps_per_run, subsample=subsample, epsilon=epsilon)

                if s is None or r is None or a is None:
                    continue

                s = np.asarray(s)
                r = np.asarray(r)
                a = np.asarray(a)
                e_s = np.asarray(e_s)

                if s.shape[1:] != (env_size, env_size, 3):
                    s = s.transpose((0, 2, 3, 1))
                if e_s.shape[1:] != (env_size, env_size, 3):
                    e_s = e_s.transpose((0, 2, 3, 1))

                states.append(s)
                rewards.append(r)
                actions.append(a)
                end_states.append(e_s)

            # Soft train
            try:
                s_train = np.concatenate(states, axis=0)
                r_train = np.concatenate(rewards, axis=0)
                a_train = np.concatenate(actions, axis=0)
                e_s_train = np.concatenate(end_states, axis=0)
            except Exception as e:
                print("Soft", e)
                continue
            ml_simulator.model.train(s_train, r_train, a_train, e_s_train)

            n = len(r_train)
            total_steps_performed += n

            if len(memory_rewards) > memory_size:
                memory_rewards = memory_rewards[n * 2:] + rewards
                memory_actions = memory_actions[n * 2:] + actions
                memory_states = memory_states[n * 2:] + states
                memory_end_states = memory_end_states[n * 2:] + end_states
            else:
                memory_rewards = memory_rewards + rewards
                memory_actions = memory_actions + actions
                memory_states = memory_states + states
                memory_end_states = memory_end_states + end_states

                print("\t~ Building memory DB", len(memory_rewards))

        # Hard Training
        try:
            statesTrain = np.concatenate(memory_states, axis=0)
            rewardsTrain = np.concatenate(memory_rewards, axis=0)
            actionsTrain = np.concatenate(memory_actions, axis=0)
            endStateTrain = np.concatenate(memory_end_states, axis=0)
        except Exception as e:
            print("Concat error:", e)
            continue

        env_shape = ml_simulator.env.verticeMatrix.shape
        while statesTrain.shape[1] != env_shape[0] or statesTrain.shape[
                2] != env_shape[1] or statesTrain.shape[3] != 3:
            statesTrain = statesTrain.transpose((0, 2, 3, 1))

        while endStateTrain.shape[1] != env_shape[0] or endStateTrain.shape[
                2] != env_shape[1] or endStateTrain.shape[3] != 3:
            endStateTrain = endStateTrain.transpose((0, 2, 3, 1))

        save = (total_steps_performed > save_top)
        print("\t- Memory Training...")
        loss = ml_simulator.model.train(statesTrain,
                                        rewardsTrain,
                                        actionsTrain,
                                        endStateTrain,
                                        save=save)
        print('\tLoss: {}'.format(loss))
        loss_history.append(loss)

        if save:
            save_top += 2500
            plot_history(loss_history)
            ml_simulator.run_and_plot(run_dir, radius, radius)

    m.save()
    ml_simulator.run_and_plot(run_dir, 'FINAL')

    plot_history(loss_history)
Example no. 22
def test_learner_against_human():
    # tests Q-learner against human player ("O")
    p1 = Player(strategy="basic_q", load_Q=True)
    p2 = Player(strategy="human")
    game = Game(p1=p1, p2=p2, verbose=True)
    game.play_game()
Example no. 23
class controller:
    def __init__(self):
        # print("init")
        self.game = Game()  # initialize a turtle game instance
        self.agent = SAC()  # initialize a soft actor critic agent
        """
		initialize human & agent actions
		"""
        self.action_human = 0.0
        self.action_agent = 0.0
        """
		Create subscribers for human action
		act_human_sub_y is for the case that the agent's action is not used
		"""
        self.act_human_sub = rospy.Subscriber("/rl/action_x", action_msg,
                                              self.set_human_action)
        self.act_human_sub_y = rospy.Subscriber("/rl/action_y", action_msg,
                                                self.set_human_action_y)
        """
		Create publishers for turtle's acceleration, agent's action, 
		robot reset signal and turtle's position on x axis
		"""
        self.turtle_accel_pub = rospy.Publisher("/rl/turtle_accel",
                                                Float32,
                                                queue_size=10)
        self.act_agent_pub = rospy.Publisher("/rl/action_y",
                                             action_msg,
                                             queue_size=10)
        self.reset_robot_pub = rospy.Publisher("/robot_reset",
                                               Int16,
                                               queue_size=1)
        self.turtle_state_pub = rospy.Publisher("/rl/turtle_pos_X",
                                                Int16,
                                                queue_size=10)
        """
		Initialize statistics lists
		"""
        self.transmit_time_list = []
        self.agent_act_list = []
        self.human_act_list = []
        self.action_timesteps = []
        self.exec_time_list = []
        self.act_human_list = []
        self.real_act_list = []
        self.time_real_act_list = []
        self.time_turtle_pos = []
        self.time_turtle_vel = []
        self.time_turtle_acc = []
        self.exec_time = None
        self.action_human_timestamp = 0
        self.human_action_delay_list = []
        self.used_human_act_list = []
        self.used_human_act_time_list = []
        self.position_timesteps = []
        self.fps_list = []
        self.action_human_y = 0
        self.human_act_list_total = []
        self.agent_act_list_total = []
        self.interaction_counter = 0
        self.iter_num = 0
        self.run_num = 1
        """
		Initialize RL lists
		"""
        self.rewards_list = []
        self.turn_list = []
        self.interaction_time_list = []
        self.interaction_training_time_list = []
        self.mean_list = []
        self.stdev_list = []
        self.alpha_values = []
        self.policy_loss_list = []
        self.value_loss_list = []
        self.q_loss_list = []
        self.critics_lr_list = []
        self.value_critic_lr_list = []
        self.actor_lr_list = []
        self.trials_list = []
        """
		Initialize turtle's monitoring lists
		"""
        self.turtle_pos_x = []
        self.turtle_vel_x = []
        self.turtle_acc_x = []
        self.turtle_pos_y = []
        self.turtle_vel_y = []
        self.turtle_acc_y = []
        self.turtle_pos_x_dict = {}
        """
		Initialize Paths & Dirs
		"""
        rospack = rospkg.RosPack()
        package_path = rospack.get_path("collaborative_games")

        self.plot_directory = package_path + "/src/plots/" + control_mode + "/"
        if not os.path.exists(self.plot_directory):
            print("Dir %s was not found. Creating it..." %
                  (self.plot_directory))
            os.makedirs(self.plot_directory)

        exp_num = 1
        while 1:
            if os.path.exists(self.plot_directory + "Experiment_" +
                              str(exp_num) + "/"):
                exp_num += 1
            else:
                self.plot_directory += "Experiment_" + str(exp_num) + "/"
                os.makedirs(self.plot_directory)
                break

        if not os.path.exists(self.plot_directory + 'rl_dynamics/'):
            print("Dir %s was not found. Creating it..." %
                  (self.plot_directory + 'rl_dynamics/'))
            os.makedirs(self.plot_directory + 'rl_dynamics/')
        # if not os.path.exists(self.plot_directory + "turtle_dynamics/"):
        # 	print("Dir %s was not found. Creating it..." %(self.plot_directory + 'turtle_dynamics/'))
        # 	os.makedirs(self.plot_directory + 'turtle_dynamics/')

    def set_human_action(self, action_human):
        """
		Gets the human action from the publisher.
		"""
        if action_human.action != 0.0:
            self.action_human = action_human.action
            self.action_human_timestamp = action_human.header.stamp.to_sec()
            self.time_real_act_list.append(self.action_human_timestamp)
            self.real_act_list.append(self.action_human)

            self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                           action_human.header.stamp.to_sec())

    def set_human_action_y(self, action_human):
        """
		Gets the human action from the publisher.
		"""
        if action_human.action != 0.0:
            self.action_human_y = action_human.action

    def game_loop(self):
        """
		The main loop of the Collaborative Game.
		This loop contains the training and the testing for the collaborative game experiment		
		"""
        first_update = True
        global_time = rospy.get_rostime().to_sec()

        # Run a testing session before starting training
        self.reset_lists()
        # score_list =  self.test()
        # self.mean_list.append(mean(score_list))
        # self.stdev_list.append(stdev(score_list))
        # self.trials_list.append(score_list)

        # Run trials
        self.resetGame()
        self.interaction_counter = 0
        for exp in range(MAX_STEPS + 1):

            self.check_goal_reached()

            if self.game.running:  # checks if the game has not finished yet
                self.interaction_counter += 1

                start_interaction_time = time.time()

                self.game.experiment = exp
                self.turns += 1

                state = self.getState()

                # checks if agent or human controls the y axis
                if use_agent:
                    agent_act = self.agent.next_action(state)
                else:
                    agent_act = self.action_human_y

                self.turtle_state_pub.publish(state[2])
                self.publish_agent_action(agent_act)

                tmp_time = time.time()
                act_human = self.action_human  # self.action_human is changing while the loop is running

                human_act_timestamp = self.action_human_timestamp
                self.used_human_act_list.append(act_human)
                self.used_human_act_time_list.append(tmp_time)
                """This loop ensures that an action will be excexuted for the ACTION_DURATION"""
                count = 0
                while ((time.time() - tmp_time) <
                       action_duration) and self.game.running:
                    count += 1
                    # control mode is "accel" or "vel"
                    self.exec_time = self.game.play(
                        [act_human, agent_act.item()],
                        control_mode=control_mode)
                    self.human_action_delay_list.append(time.time() -
                                                        human_act_timestamp)
                    # exec_time = self.game.play([act_human, 0],control_mode=control_mode)

                    self.agent_act_list.append(agent_act.item())
                    self.human_act_list.append(act_human)
                    self.action_timesteps.append(tmp_time)

                    self.save_stats()
                    self.check_goal_reached()

                # print count
                """ Retrive RLself.information."""
                reward = self.getReward()
                next_state = self.getState()
                done = self.game.finished
                episode = [state, reward, agent_act, next_state, done]

                self.agent.update_rw_state(episode)

                self.total_reward_per_game += reward

                self.interaction_time_list.append(time.time() -
                                                  start_interaction_time)

                # when replay buffer has enough samples update gradient at every turn
                if first_update and len(
                        self.agent.D) >= BATCH_SIZE and exp > UPDATE_START:

                    if first_update:
                        print("\nStarting updates")
                        first_update = False

                    if use_agent:
                        [
                            alpha, policy_loss, value_loss, q_loss, critics_lr,
                            value_critic_lr, actor_lr
                        ] = self.agent.train(sample=episode)
                        self.save_rl_data(alpha, policy_loss, value_loss,
                                          q_loss, critics_lr, value_critic_lr,
                                          actor_lr)

                        self.interaction_training_time_list.append(
                            time.time() - start_interaction_time)

                # run "offline_updates_num" offline gradient updates every "UPDATE_INTERVAL" steps
                elif not first_update and len(
                        self.agent.D
                ) >= BATCH_SIZE and exp % UPDATE_INTERVAL == 0:

                    print(str(offline_updates_num) + " Gradient updates")
                    self.game.waitScreen("Training... Please Wait.")

                    pbar = tqdm(xrange(1, offline_updates_num + 1),
                                unit_scale=1,
                                smoothing=0)
                    for _ in pbar:
                        if use_agent:
                            [
                                alpha, policy_loss, value_loss, q_loss,
                                critics_lr, value_critic_lr, actor_lr
                            ] = self.agent.train(verbose=False)
                            self.interaction_training_time_list.append(
                                time.time() - start_interaction_time)

                            self.save_rl_data(alpha, policy_loss, value_loss,
                                              q_loss, critics_lr,
                                              value_critic_lr, actor_lr)

                    # # run testing trials
                    self.reset_lists()
                    score_list = self.test()

                    self.mean_list.append(mean(score_list))
                    self.stdev_list.append(stdev(score_list))
                    self.trials_list.append(score_list)

                    self.resetGame()
            else:  #the game has finished
                self.save_and_plot_stats_environment(self.run_num)
                self.run_num += 1
                self.turn_list.append(self.turns)
                self.rewards_list.append(self.total_reward_per_game)

                # reset game
                self.resetGame()

        self.save_and_plot_stats_rl()

        print(
            "Average Human Action Transmission time per interaction: %f milliseconds(stdev: %f). \n"
            % (mean(self.transmit_time_list) * 1e3,
               stdev(self.transmit_time_list) * 1e3))
        print(
            "Average Execution time per interaction: %f milliseconds(stdev: %f). \n"
            % (mean(self.interaction_time_list) * 1e3,
               stdev(self.interaction_time_list) * 1e3))
        if use_agent:
            print(
                "Average Execution time per interaction and online update: %f milliseconds(stdev: %f). \n"
                % (mean(self.interaction_training_time_list) * 1e3,
                   stdev(self.interaction_training_time_list) * 1e3))

        print("Total time of experiments is: %d minutes and %d seconds.\n" %
              ((rospy.get_rostime().to_sec() - global_time) / 60,
               (rospy.get_rostime().to_sec() - global_time) % 60))

        self.game.endGame()

    def test(self):
        """ Performs the testing. The average value of the policy is used as the action and no sampling is being performed."""
        score_list = []
        self.iter_num += 1
        for game in range(test_num):
            score = INTER_NUM

            self.resetGame("Testing Model. Trial %d of %d." %
                           (game + 1, test_num))
            self.interaction_counter = 0
            while self.game.running:
                self.interaction_counter += 1
                self.game.experiment = "Test: " + str(game + 1)

                state = self.getState()

                if use_agent:
                    agent_act = self.agent.next_action(
                        state, stochastic=False)  # take only the mean
                else:
                    agent_act = self.action_human_y

                act_human = self.action_human

                tmp_time = time.time()
                self.agent_act_list.append(agent_act.item())
                self.human_act_list.append(act_human)
                self.action_timesteps.append(tmp_time)
                # print(agent_act)
                self.publish_agent_action(agent_act)

                while ((time.time() - tmp_time) <
                       action_duration) and self.game.running:
                    self.exec_time = self.game.play(
                        [act_human, agent_act.item()],
                        total_games=test_num,
                        control_mode=control_mode)
                    self.save_stats()
                    self.check_goal_reached()

                score -= 1
                self.check_goal_reached()

            score_list.append(score)
            self.save_and_plot_stats_environment("Test_" + str(self.iter_num) +
                                                 "_" + str(game))

        return score_list

    def save_stats(self):
        """Saves all the turtle-potition-relevant statistics for later analysis."""
        self.turtle_accel_pub.publish(self.game.accel_x)

        self.turtle_pos_x.append(self.game.real_turtle_pos[0])
        self.turtle_pos_y.append(self.game.real_turtle_pos[1])
        self.time_turtle_pos.append(rospy.get_rostime().to_sec())

        self.turtle_vel_x.append(self.game.vel_x)
        self.turtle_vel_y.append(self.game.vel_y)
        self.time_turtle_vel.append(rospy.get_rostime().to_sec())

        self.turtle_acc_x.append(self.game.accel_x)
        self.turtle_acc_y.append(self.game.accel_y)
        self.time_turtle_acc.append(rospy.get_rostime().to_sec())

        self.fps_list.append(self.game.current_fps)

        self.turtle_pos_x_dict[self.game.turtle_pos[0]] = self.exec_time

        self.exec_time_list.append(self.exec_time)

    def getReward(self):
        """Calculates the reward for the SAC agent"""
        if self.game.finished:
            if self.game.timedOut and REWARD_TYPE == "penalty":
                return -50
            else:
                return 100
        else:
            return -1

    def getState(self):
        """Returns the state of the turtle game."""
        return [
            self.game.accel_x, self.game.accel_y, self.game.turtle_pos[0],
            self.game.turtle_pos[1], self.game.vel_x, self.game.vel_y
        ]

    def check_goal_reached(self, name="turtle"):
        """Checks if the goal of the turtle game has been reached."""
        if name is "turtle":
            if self.game.time_dependend and (
                    self.interaction_counter % INTER_NUM
                    == 0) and self.interaction_counter != 0:
                self.game.running = 0
                self.game.exitcode = 1
                self.game.timedOut = True
                self.game.finished = True
                self.interaction_counter = 0

            # if self.game.width - 40 > self.game.turtle_pos[0] > self.game.width - (80 + 40) \
            # and 20 < self.game.turtle_pos[1] < (80 + 60 / 2 - 32):

            # if (self.game.width - 160) <= self.game.turtle_pos[0] <= (self.game.width - 60 - 64) and 40 <= self.game.turtle_pos[1] <= (140 - 64):
            # 	self.game.running = 0
            # 	self.game.exitcode = 1
            # 	self.game.finished = True  # This means final state achieved
            # 	self.interaction_counter = 0
            if (self.game.width - 160) <= self.game.turtle_pos[0] + 32 <= (
                    self.game.width -
                    60) and 40 <= self.game.turtle_pos[1] + 32 <= 140:
                self.game.running = 0
                self.game.exitcode = 1
                self.game.finished = True  # This means final state achieved
                self.interaction_counter = 0

    def resetGame(self, msg=None):
        """Resets the Turtle Game."""
        wait_time = 3
        # self.reset_robot_pub.publish(1)
        self.game.waitScreen(msg1="Put Right Wrist on starting point.",
                             msg2=msg,
                             duration=wait_time)
        self.game = Game()
        self.TIME = ACTION_DURATION * INTER_NUM
        self.action_human = 0.0
        self.game.start_time = time.time()
        self.total_reward_per_game = 0
        self.turns = 0
        # self.reset_robot_pub.publish(1)

    def save_and_plot_stats_rl(self):
        """Saves & plots all the rl-relevant statistics for later analysis."""

        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'alpha_values.csv',
                   self.alpha_values,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'policy_loss.csv',
                   self.policy_loss_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'value_loss.csv',
                   self.value_loss_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'q_loss.csv',
                   self.q_loss_list,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'rewards_list.csv',
                   self.rewards_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'turn_list.csv',
                   self.turn_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'means.csv',
                   self.mean_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'stdev.csv',
                   self.stdev_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' +
                   'critics_lr_list.csv',
                   self.critics_lr_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' +
                   'value_critic_lr_list.csv',
                   self.value_critic_lr_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'actor_lr_list.csv',
                   self.actor_lr_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rl_dynamics/' + 'trials_list.csv',
                   self.trials_list,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + 'agent_act_list_total.csv',
                   self.agent_act_list_total,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'human_act_list_total.csv',
                   self.human_act_list_total,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + 'human_action_delay_list.csv',
                   self.human_action_delay_list,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + 'fps_list.csv',
                   self.fps_list,
                   delimiter=',',
                   fmt='%f')
        plot_hist(self.fps_list, self.plot_directory,
                  'fps_list_' + str(self.game.fps))

        np.savetxt(self.plot_directory + 'exec_time_list.csv',
                   self.exec_time_list,
                   delimiter=',',
                   fmt='%f')

        plot(range(len(self.alpha_values)),
             self.alpha_values,
             "alpha_values",
             'Alpha Value',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.policy_loss_list)),
             self.policy_loss_list,
             "policy_loss",
             'Policy loss',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.value_loss_list)),
             self.value_loss_list,
             "value_loss_list",
             'Value loss',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.rewards_list)),
             self.rewards_list,
             "Rewards_per_game",
             'Total Rewards per Game',
             'Number of Games',
             self.plot_directory,
             save=True)
        plot(range(len(self.turn_list)),
             self.turn_list,
             "Steps_per_game",
             'Turns per Game',
             'Number of Games',
             self.plot_directory,
             save=True)

        plot(range(len(self.critics_lr_list)),
             self.critics_lr_list,
             "critics_lr_list",
             'Critic lr',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.value_critic_lr_list)),
             self.value_critic_lr_list,
             "value_critic_lr_list",
             'Value lr',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.actor_lr_list)),
             self.actor_lr_list,
             "actor_lr_list",
             'Actor lr',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)

        try:
            plot(range(UPDATE_INTERVAL, MAX_STEPS + UPDATE_INTERVAL,
                       UPDATE_INTERVAL),
                 self.mean_list,
                 "trials",
                 'Tests Score',
                 'Number of Interactions',
                 self.plot_directory,
                 save=True,
                 variance=True,
                 stdev=self.stdev_list)
        except Exception:
            print("Trials were not plotted")

        plot_hist(self.agent_act_list_total,
                  self.plot_directory + 'agent_act_hist_total',
                  'Agent Action Histogram')
        plot_hist(self.human_act_list_total,
                  self.plot_directory + 'human_act_hist_total',
                  'Human Action Histogram')

        # human_action_delay_list_new = [elem for elem in self.human_action_delay_list if elem <0.8] # remove outliers caused by saving and plotting functions
        plot_hist(self.human_action_delay_list,
                  self.plot_directory + 'human_action_delay_list',
                  'Human Action Delay Histogram')

    def save_and_plot_stats_environment(self, run_num):
        """Saves all environment-relevant statistics for later analysis."""

        run_subfolder = "game_" + str(run_num) + "/"
        os.makedirs(self.plot_directory + run_subfolder + "turtle_dynamics/")

        self.exp_details()

        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'agent_act_list.csv',
                   self.agent_act_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'human_act_list.csv',
                   self.human_act_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'action_timesteps.csv',
                   self.action_timesteps,
                   delimiter=',',
                   fmt='%f')
        human_act_list = np.genfromtxt(self.plot_directory + run_subfolder +
                                       "turtle_dynamics/" +
                                       'human_act_list.csv',
                                       delimiter=',')
        agent_act_list = np.genfromtxt(self.plot_directory + run_subfolder +
                                       "turtle_dynamics/" +
                                       'agent_act_list.csv',
                                       delimiter=',')
        plot_hist(
            agent_act_list, self.plot_directory + run_subfolder +
            "turtle_dynamics/" + 'agent_act_hist_'
            "game_" + str(run_num), 'Agent Action Histogram')
        plot_hist(
            human_act_list, self.plot_directory + run_subfolder +
            "turtle_dynamics/" + 'human_act_hist_'
            "game_" + str(run_num), 'Human Action Histogram')

        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'turtle_pos_x.csv',
                   self.turtle_pos_x,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'turtle_pos_y.csv',
                   self.turtle_pos_y,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'time_turtle_pos.csv',
                   self.time_turtle_pos,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'turtle_vel_x.csv',
                   self.turtle_vel_x,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'turtle_vel_y.csv',
                   self.turtle_vel_y,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'time_turtle_vel.csv',
                   self.time_turtle_vel,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'turtle_accel_x.csv',
                   self.turtle_acc_x,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'turtle_accel_y.csv',
                   self.turtle_acc_y,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'time_turtle_acc.csv',
                   self.time_turtle_acc,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'real_act_list.csv',
                   self.real_act_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'time_real_act_list.csv',
                   self.time_real_act_list,
                   delimiter=',',
                   fmt='%f')

        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'used_human_act_list.csv',
                   self.used_human_act_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + run_subfolder + "turtle_dynamics/" +
                   'used_human_act_time_list.csv',
                   self.used_human_act_time_list,
                   delimiter=',',
                   fmt='%f')

        subplot(self.plot_directory + run_subfolder + "turtle_dynamics/",
                self.turtle_pos_x, self.turtle_vel_x, self.turtle_acc_x,
                self.time_turtle_pos, self.time_turtle_vel,
                self.time_turtle_acc, human_act_list, self.action_timesteps,
                "x", control_mode)
        subplot(self.plot_directory + run_subfolder + "turtle_dynamics/",
                self.turtle_pos_y, self.turtle_vel_y, self.turtle_acc_y,
                self.time_turtle_pos, self.time_turtle_vel,
                self.time_turtle_acc, agent_act_list, self.action_timesteps,
                "y", control_mode)

        plt.figure("Real_Human_Actions_Comparison", figsize=(25, 10))
        plt.grid()

        plt.ylabel('Human Actions')
        plt.xlabel('Msg Timestamp(seconds)')
        # plt.xticks(plt.xticks(np.arange(min(min(time_real_act_list),min(self.used_human_act_time_list)), max(max(time_real_act_list),max(used_human_act_time_list)), 1)))
        plt.scatter(self.time_real_act_list, self.real_act_list)
        plt.scatter(self.used_human_act_time_list, self.used_human_act_list)
        plt.savefig(self.plot_directory + run_subfolder +
                    "human_real_action_comparison",
                    dpi=150)

        self.reset_lists()

    def publish_agent_action(self, agent_act):
        """Publishes Agent's action."""
        h = std_msgs.msg.Header()
        h.stamp = rospy.Time.now()

        act = action_msg()
        act.action = agent_act
        act.header = h

        self.act_agent_pub.publish(act)

    def reset_lists(self):
        """Reseting funstion for the list."""
        self.human_act_list_total.extend(self.human_act_list)
        self.agent_act_list_total.extend(self.agent_act_list)

        self.agent_act_list = []
        self.human_act_list = []
        self.action_timesteps = []
        self.turtle_pos_x = []
        self.turtle_pos_y = []
        self.time_turtle_pos = []
        self.turtle_vel_x = []
        self.turtle_vel_y = []
        self.time_turtle_vel = []
        self.turtle_acc_x = []
        self.turtle_acc_y = []
        self.time_turtle_acc = []
        self.real_act_list = []
        self.time_real_act_list = []

        self.used_human_act_list = []
        self.used_human_act_time_list = []

    def save_rl_data(self, alpha, policy_loss, value_loss, q_loss, critics_lr,
                     value_critic_lr, actor_lr):
        """RL data saver."""
        self.alpha_values.append(alpha.item())
        self.policy_loss_list.append(policy_loss.item())
        self.value_loss_list.append(value_loss.item())
        self.q_loss_list.append(q_loss.item())

        self.critics_lr_list.append(critics_lr)
        self.value_critic_lr_list.append(value_critic_lr)
        self.actor_lr_list.append(actor_lr)

    def exp_details(self):
        """Keeps track of the experiment details & saves them for later reference at the experiment."""
        exp_details = {}
        exp_details["MAX_STEPS"] = MAX_STEPS
        exp_details["CONTROL_MODE"] = CONTROL_MODE
        exp_details["ACCEL_RATE"] = ACCEL_RATE
        exp_details["ACTION_DURATION"] = ACTION_DURATION
        exp_details["BATCH_SIZE"] = BATCH_SIZE
        exp_details["REPLAY_SIZE"] = REPLAY_SIZE
        exp_details["OFFLINE_UPDATES"] = OFFLINE_UPDATES
        exp_details["UPDATE_START"] = UPDATE_START

        csv_file = "exp_details.csv"
        try:
            with open(self.plot_directory + csv_file, "w") as f:
                w = csv.writer(f)
                for key, val in exp_details.items():
                    w.writerow([key, val])
        except IOError:
            print("I/O error while writing " + csv_file)
Esempio n. 24
0
class controller:
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human = 0.0
        self.action_agent = 0.0
        # # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
        # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
        self.agent_sub = rospy.Subscriber("/rl/final_action", action_agent,
                                          self.set_action_agent)

        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/reward_and_observation_game',
                                             reward_observation,
                                             queue_size=10,
                                             latch=True)

        self.transmit_time_list = []
        self.prev_state = self.game.getState()

        self.publish_reward_and_observations()

    # def set_action_human(self,action_human):
    # 	self.action_human = action_human.action
    # 	self.transmit_time_list.append(rospy.get_rostime().to_sec()  - action_human.header.stamp.to_sec())

    def set_action_agent(self, action_agent):
        self.prev_state = self.game.getState()
        self.action_agent = action_agent.action[1]
        self.action_human = action_agent.action[0]
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       action_agent.header.stamp.to_sec())

    def publish_reward_and_observations(self):
        h = std_msgs.msg.Header()
        h.stamp = rospy.Time.now()

        new_obs = reward_observation()
        new_obs.header = h
        new_obs.observations = self.game.getState()
        new_obs.prev_state = self.prev_state
        new_obs.final_state = self.game.finished
        new_obs.reward = self.game.getReward()

        self.obs_robot_pub.publish(new_obs)

    def game_loop(self):
        total_time = []
        # print self.action_human
        for exp in range(self.experiments_num):
            print("Experiment %d" % (exp + 1))
            while self.game.running:
                exec_time = self.game.play(
                    [self.action_human, self.action_agent])
                total_time.append(exec_time)
                self.publish_reward_and_observations()

            # self.game.endGame()

            # publish last reward and state
            self.publish_reward_and_observations()

            # reset game
            self.game = Game()
            self.game.start_time = time.time()

        self.game.endGame()
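For completeness, the agent node on the other side of these topics would subscribe to /rl/reward_and_observation_game and feed the rewards and observations to the learner. A minimal listener sketch; the my_rl_msgs package name is hypothetical, only the topic and field names follow the code above:

import rospy
# hypothetical package; replace with the package that actually defines reward_observation
from my_rl_msgs.msg import reward_observation


def on_reward_observation(msg):
    # log the reward and whether the game reached a terminal state
    rospy.loginfo("reward=%.3f final=%s", msg.reward, msg.final_state)


if __name__ == "__main__":
    rospy.init_node("rl_game_listener")
    rospy.Subscriber("/rl/reward_and_observation_game", reward_observation,
                     on_reward_observation)
    rospy.spin()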
Esempio n. 25
0
def train(method, environment, resume, episodes, lr, lr_episodes, min_lr,
          eval_only, replay_width, batch_size, gamma, update_rate,
          save_interval):

    history = History(method + '_' + environment,
                      ['steps', 'avg_reward', 'loss'], resume is not None)
    history.flush()
    memory = ReplayMemory(replay_width)
    game = Game(name=environments_to_names[environment],
                memory=memory,
                render=False)
    init_state, state_shape = game.get_state(True)
    n_actions = game.env.action_space.n
    agent_cls = agent_factory[method]
    agent = agent_cls(state_shape,
                      n_actions,
                      environment,
                      episodes,
                      update_rate,
                      step_size=lr_episodes,
                      lr=lr,
                      save_interval=save_interval)

    # resume from a ckpt
    if resume is not None:
        agent.load(resume)

    avg_reward = MovingAverage(100)
    avg_loss = MovingAverage(100)

    log.info(f'Training with {episodes}, starting ...')

    # main training loop
    for i in range(episodes):
        state = game.reset()
        done = False
        loss = None
        while not done:
            state = game.state
            action = agent.select_action(state)

            transition, done = game.step(int(action.to('cpu').numpy()))

            if len(memory) > batch_size:
                batched = memory.sample(batch_size)
                loss = agent.train(batched, batch_size, gamma, i)
                avg_loss.add(loss)
        reward = game.rewards
        # agent.save_best(reward)
        agent.save()
        agent.scheduler.step()
        avg_reward.add(reward)

        # moving averages
        text = [
            f'steps: {agent.step_cnt}',
            f'game epochs: {i}/{episodes}',
            f'train loss: {float(avg_loss):.5}',
            f'avg reward: {float(avg_reward):.5}',
            # f'best reward: {float(agent.best_reward):.5}',
            f'reward: {float(reward):.5}',
            f'epsilon: {agent.epsilon:.3}',
        ]
        log.info(', '.join(text), update=True)
        if agent.step_cnt % save_interval == 0:
            history.record({
                'steps': agent.step_cnt,
                'avg_reward': float(avg_reward),
                'loss': float(avg_loss),
            })

    game.env.close()
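MovingAverage is not defined in this snippet; one implementation consistent with how it is used above (add() pushes a value, float() returns the mean over the last N values) would be:

from collections import deque


class MovingAverage:
    """Fixed-window running mean; float(m) returns the current average."""

    def __init__(self, window):
        self.values = deque(maxlen=window)

    def add(self, value):
        self.values.append(float(value))

    def __float__(self):
        return sum(self.values) / len(self.values) if self.values else 0.0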
Esempio n. 26
0
    def build(self):
        experiment = Game()
        experiment.deal()
        return experiment
Esempio n. 27
0
# -*- coding: utf-8 -*-
from environment import Game
from algorithm import QLearn


def play(algo, game, steps):
    for _ in range(steps):
        s = game.restart()
        a = algo.choose(s)
        algo.eligibility_trace *= 0
        game.view()
        while not game.over():
            s_, r = game.transition(a)
            a_ = algo.choose(s_)
            algo.update(s, a, s_, a_, r)
            s = s_
            a = a_
            # print algo.eligibility_trace
            game.view()
        # print _


actions = ['u', 'd', 'l', 'r']
algo = QLearn(actions=actions)
game = Game()
play(algo, game, 100)
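The QLearn class imported from algorithm is not shown here. Judging from the calls above (choose(), update() with the next on-policy action, and an eligibility_trace that is zeroed in place at the start of every episode), the learner behaves like tabular SARSA(lambda). A minimal sketch with the same interface; the class and helper names below are mine, not the repository's:

import random
from collections import defaultdict


class EligibilityTrace(defaultdict):
    """Trace table that supports the in-place `trace *= 0` reset used in play()."""

    def __imul__(self, factor):
        for key in self:
            self[key] *= factor
        return self


class TabularSarsaLambda:
    """Tabular SARSA(lambda) consistent with the choose()/update() calls above."""

    def __init__(self, actions, alpha=0.1, gamma=0.9, lam=0.9, epsilon=0.1):
        self.actions = list(actions)
        self.alpha, self.gamma, self.lam, self.epsilon = alpha, gamma, lam, epsilon
        self.q = defaultdict(float)                       # (state, action) -> value
        self.eligibility_trace = EligibilityTrace(float)  # (state, action) -> trace

    def choose(self, s):
        # epsilon-greedy over the action values of state s
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        return max(self.actions, key=lambda a: self.q[(s, a)])

    def update(self, s, a, s_, a_, r):
        # TD error uses the action actually chosen next (on-policy)
        delta = r + self.gamma * self.q[(s_, a_)] - self.q[(s, a)]
        self.eligibility_trace[(s, a)] += 1.0
        for key in list(self.eligibility_trace):
            self.q[key] += self.alpha * delta * self.eligibility_trace[key]
            self.eligibility_trace[key] *= self.gamma * self.lam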
Esempio n. 28
0
import torch

import os
import pickle
import numpy as np

from constants import *
from environment import Game
from model import Actor, Critic
from ppo import PPO
from utils import plot_data
from running_state import *
from replay_memory import *

env = Game()
n_input = env.state_dim

actor = Actor(n_input, N_HIDDEN)
critic = Critic(n_input, N_HIDDEN)

# retrieve previous saved model if exists
if os.path.exists(ACTOR_SAVE_PATH):
    print("Loading saved actor model...")
    actor.load_state_dict(torch.load(ACTOR_SAVE_PATH))
if os.path.exists(CRITIC_SAVE_PATH):
    print("Loading saved critic model...")
    critic.load_state_dict(torch.load(CRITIC_SAVE_PATH))

ppo_agent = PPO(env, actor, critic)

running_state = ZFilter((2, ), clip=5)
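ZFilter from running_state keeps running statistics of each observation dimension, normalizes new observations and clips the result (here to +/-5). A minimal sketch of that idea using Welford-style running mean/variance; this shows the general technique, not necessarily the repository's exact implementation:

import numpy as np


class RunningNorm:
    """Online observation normalizer: subtract the running mean, divide by the
    running std, then clip -- the role ZFilter plays above."""

    def __init__(self, shape, clip=5.0, eps=1e-8):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)  # running sum of squared deviations (Welford)
        self.clip = clip
        self.eps = eps

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + self.eps
        return np.clip((x - self.mean) / std, -self.clip, self.clip)

The training loop, which is truncated here, would presumably pass each raw observation through running_state before handing it to the actor and critic.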
Esempio n. 29
0
        loss = F.nll_loss(action_r, tocuda(Variable(torch.LongTensor(labels)), cuda))
        loss.backward()
        optim_r.step()
    else:
        raise ValueError('Invalid parameter')
    
    for i in range(n_games):
        game_pool[i].reset()
    
    return sum(reward) / 32.

n_hidden = 200
n_games = args.batch
game_pool = []
for i in range(n_games):
    game_pool.append(Game(n_numbers, n_k))

R = Receiver(n_numbers, n_k, n_hidden)
SA = SenderActor(n_numbers, n_k, n_hidden)
SC = SenderCritic(n_numbers, n_k, n_hidden)
if cuda:
    R.cuda()
    SA.cuda()
    SC.cuda()

optim_r = optim.Adam(R.parameters(), lr=1e-3)
optim_sa = optim.Adam(SA.parameters(), lr=1e-3)
optim_sc = optim.Adam(SC.parameters(), lr=1e-3)

running_succ_rate = 1. / n_k
for epoch in count(1):
Esempio n. 30
0
import torch

import pickle
import os

from environment import Game
from replay_mem import ReplayMemory
from ddpg import DDPG
from constants import *

# game environment
env = Game()

# Create replay memory
memory = ReplayMemory(MAX_BUFF_SIZE, env.state_dim, env.n_actions)

# DDPG agent
agent = DDPG(env, memory)


def initialize_replay_mem():
    '''
    Initialize the replay memory.
    '''
    print("Initializing replay memory...")
    S = env.reset()
    for _ in range(MAX_BUFF_SIZE):
        A = env.sample_action()
        S_prime, R, is_done = env.take_action(A)
        memory.add_to_memory((S, A, S_prime, R, is_done))
        if is_done:
            # episode ended: restart the environment before collecting more transitions
            S = env.reset()
        else:
            S = S_prime
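The ReplayMemory class from replay_mem is not shown in this fragment. A minimal buffer consistent with the constructor and the add_to_memory() call above; the sample() method and the class name are assumptions about the rest of the interface:

import random
from collections import deque


class SimpleReplayMemory:
    """Fixed-size buffer of (S, A, S_prime, R, is_done) transitions."""

    def __init__(self, max_size, state_dim=None, n_actions=None):
        # state_dim and n_actions mirror the constructor call above, but a
        # deque-based buffer does not actually need them
        self.buffer = deque(maxlen=max_size)

    def add_to_memory(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), min(batch_size, len(self.buffer)))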
Esempio n. 31
0
class controller:
    def __init__(self):
        print("init")
        self.experiments_num = 10
        # self.human_sub = rospy.Subscriber("/RW_x_direction", Int16, self.simulate)
        self.game = Game()
        self.action_human = 0.0
        self.action_agent = 0.0
        # # self.human_sub = rospy.Subscriber("/rl/hand_action_x", action_msg, self.set_action_human)
        # self.human_sub = rospy.Subscriber("/rl/action_x", action_msg, self.set_action_human)
        self.agent_sub = rospy.Subscriber("/rl/total_action", action_agent,
                                          self.set_action_agent)

        # self.reward_pub = rospy.Publisher('/rl/reward', Int16, queue_size = 10)
        self.obs_robot_pub = rospy.Publisher('/rl/game_response',
                                             reward_observation,
                                             queue_size=10,
                                             latch=True)

        self.transmit_time_list = []
        self.prev_state = self.game.getState()

        self.rewards_list = []
        self.turn_list = []
        self.interaction_time_list = []
        self.interaction_training_time_list = []
        self.mean_list = []
        self.stdev_list = []
        self.alpha_values = []
        self.policy_loss_list = []
        self.value_loss_list = []
        self.critics_lr_list = []
        self.value_critic_lr_list = []
        self.actor_lr_list = []

        self.plot_directory = package_path + "/src/plots/"
        if not os.path.exists(self.plot_directory):
            print("Dir %s was not found. Creating it..." %
                  (self.plot_directory))
            os.makedirs(self.plot_directory)

    def set_action_agent(self, action_agent):
        self.prev_state = self.game.getState()
        self.action_agent = action_agent.action[1]
        self.action_human = action_agent.action[0]
        self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                       action_agent.header.stamp.to_sec())

    def set_human_agent(self, action_agent):
        if action_agent.action != 0.0:
            self.action_human = action_agent.action
            self.transmit_time_list.append(rospy.get_rostime().to_sec() -
                                           action_agent.header.stamp.to_sec())

    def publish_reward_and_observations(self):
        h = std_msgs.msg.Header()
        h.stamp = rospy.Time.now()

        new_obs = reward_observation()
        new_obs.header = h
        new_obs.observations = self.game.getState()
        new_obs.prev_state = self.prev_state
        new_obs.final_state = self.game.finished
        new_obs.reward = self.game.getReward()

        self.obs_robot_pub.publish(new_obs)

    def game_loop(self):
        first_update = True
        global_time = rospy.get_rostime().to_sec()

        self.resetGame()

        for exp in range(MAX_STEPS + 1):

            if self.game.running:
                start_interaction_time = time.time()

                self.game.experiment = exp
                self.turns += 1

                state = self.game.getState()
                agent_act = self.agent.next_action(state)

                tmp_time = time.time()
                act_human = self.action_human

                while time.time() - tmp_time < action_duration:
                    exec_time = self.game.play([act_human, agent_act.item()])

                reward = self.game.getReward()
                next_state = self.game.getState()
                done = self.game.finished
                episode = [state, reward, agent_act, next_state, done]

                self.agent.update_rw_state(episode)

                self.total_reward_per_game += reward

                self.interaction_time_list.append(time.time() -
                                                  start_interaction_time)

                # when replay buffer has enough samples update gradient at every turn
                if len(self.agent.D) >= BATCH_SIZE:

                    if first_update:
                        print("\nStarting updates")
                        first_update = False

                    [
                        alpha, policy_loss, value_loss, critics_lr,
                        value_critic_lr, actor_lr
                    ] = self.agent.train(sample=episode)
                    self.alpha_values.append(alpha.item())
                    self.policy_loss_list.append(policy_loss.item())
                    self.value_loss_list.append(value_loss.item())
                    self.critics_lr_list.append(critics_lr)
                    self.value_critic_lr_list.append(value_critic_lr)
                    self.actor_lr_list.append(actor_lr)

                    self.interaction_training_time_list.append(
                        time.time() - start_interaction_time)

                # run "offline_updates_num" offline gradient updates every "UPDATE_INTERVAL" steps
                if len(self.agent.D
                       ) >= BATCH_SIZE and exp % UPDATE_INTERVAL == 0:

                    print(str(offline_updates_num) + " gradient updates")
                    self.game.waitScreen("Training... Please Wait.")

                    pbar = tqdm(range(1, offline_updates_num + 1),
                                unit_scale=1,
                                smoothing=0)
                    for _ in pbar:
                        [
                            alpha, policy_loss, value_loss, critics_lr,
                            value_critic_lr, actor_lr
                        ] = self.agent.train(verbose=False)
                        self.alpha_values.append(alpha.item())
                        self.policy_loss_list.append(policy_loss.item())
                        self.value_loss_list.append(value_loss.item())
                        self.critics_lr_list.append(critics_lr)
                        self.value_critic_lr_list.append(value_critic_lr)
                        self.actor_lr_list.append(actor_lr)

                    # run trials
                    mean_score, stdev_score = self.test()

                    self.mean_list.append(mean_score)
                    self.stdev_list.append(stdev_score)

                    self.resetGame()
            else:
                self.turn_list.append(self.turns)
                self.rewards_list.append(self.total_reward_per_game)

                # reset game
                self.resetGame()

        self.save_and_plot_stats()

        print(
            "Average execution time per interaction: %f milliseconds (stdev: %f).\n"
            % (mean(self.interaction_time_list) * 1e3,
               stdev(self.interaction_time_list) * 1e3))
        print(
            "Average execution time per interaction and online update: %f milliseconds (stdev: %f).\n"
            % (mean(self.interaction_training_time_list) * 1e3,
               stdev(self.interaction_training_time_list) * 1e3))

        print("Total time of experiments is: %d minutes and %d seconds.\n" %
              ((rospy.get_rostime().to_sec() - global_time) / 60,
               (rospy.get_rostime().to_sec() - global_time) % 60))

        self.game.endGame()

    def test(self):

        score_list = []
        for game in range(test_num):
            score = 200

            self.resetGame("Testing Model. Trial %d of %d." %
                           (game + 1, test_num))

            while self.game.running:
                self.game.experiment = "Test: " + str(game + 1)

                state = self.game.getState()
                agent_act = self.agent.next_action(
                    state, stochastic=False)  # take only the mean
                # print(agent_act)
                tmp_time = time.time()
                while time.time() - tmp_time < 0.2:
                    exec_time = self.game.play(
                        [self.action_human,
                         agent_act.item()],
                        total_games=test_num)

                score -= 1

            score_list.append(score)

        return [mean(score_list), stdev(score_list)]

    def resetGame(self, msg=None):
        wait_time = 3
        self.game.waitScreen(msg1="Put Right Wrist on starting point.",
                             msg2=msg,
                             duration=wait_time)
        self.game = Game()
        self.action_human = 0.0
        self.action_agent = 0.0
        self.game.start_time = time.time()
        self.total_reward_per_game = 0
        self.turns = 0

    def save_and_plot_stats(self):
        np.savetxt(self.plot_directory + 'alpha_values.csv',
                   self.alpha_values,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'policy_loss.csv',
                   self.policy_loss_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'value_loss.csv',
                   self.value_loss_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'rewards_list.csv',
                   self.rewards_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'turn_list.csv',
                   self.turn_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'means.csv',
                   self.mean_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'stdev.csv',
                   self.stdev_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'critics_lr_list.csv',
                   self.critics_lr_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'value_critic_lr_list.csv',
                   self.value_critic_lr_list,
                   delimiter=',',
                   fmt='%f')
        np.savetxt(self.plot_directory + 'actor_lr_list.csv',
                   self.actor_lr_list,
                   delimiter=',',
                   fmt='%f')

        plot(range(len(self.alpha_values)),
             self.alpha_values,
             "alpha_values",
             'Alpha Value',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.policy_loss_list)),
             self.policy_loss_list,
             "policy_loss",
             'Policy loss',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.value_loss_list)),
             self.value_loss_list,
             "value_loss_list",
             'Value loss',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.rewards_list)),
             self.rewards_list,
             "Rewards_per_game",
             'Total Rewards per Game',
             'Number of Games',
             self.plot_directory,
             save=True)
        plot(range(len(self.turn_list)),
             self.turn_list,
             "Steps_per_game",
             'Turns per Game',
             'Number of Games',
             self.plot_directory,
             save=True)

        plot(range(len(self.critics_lr_list)),
             self.critics_lr_list,
             "critics_lr_list",
             'Critic lr',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.value_critic_lr_list)),
             self.value_critic_lr_list,
             "value_critic_lr_list",
             'Value lr',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)
        plot(range(len(self.actor_lr_list)),
             self.actor_lr_list,
             "actor_lr_list",
             'Actor lr',
             'Number of Gradient Updates',
             self.plot_directory,
             save=True)

        plot(range(UPDATE_INTERVAL, MAX_STEPS + UPDATE_INTERVAL,
                   UPDATE_INTERVAL),
             self.mean_list,
             "trials",
             'Tests Score',
             'Number of Interactions',
             self.plot_directory,
             save=True,
             variance=True,
             stdev=self.stdev_list)