Example No. 1
def train_model():

    # Initiates the env
    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)

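    # Each action is a 5-element controller vector: apparently two analog axes
    # (steering / acceleration) first, followed by binary button flags.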
    actions = [
        [-60, 0, 1, 0, 0],  # left
        [60, 0, 1, 0, 0],   # right
        [0, -80, 0, 1, 0],  # back
        [0, 0, 1, 0, 0],    # go straight
        # [0, 0, 0, 1, 0],  # brake
    ]

    # Initiates Model
    model = DQNModel(resolution=resolution,
                     nb_frames=learn_param['nb_frames'],
                     actions=actions)

    # print("number of actions: ", len(doom.actions))   # 16

    if model_weights:
        model.load_weights(model_weights)

    agent = RLAgent(model, **learn_param)

    # Perform Reinforcement Learning on the environment
    agent.train(env)
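
Note: gym, DQNModel, RLAgent, learn_param and model_weights are module-level names defined elsewhere in the source file; this snippet only reads learn_param['nb_frames'] and forwards the rest to RLAgent. A purely illustrative sketch of that setup, with every name and value below an assumption rather than the project's actual configuration:

import gym
import gym_mupen64plus          # assumed: the package that registers the Mario-Kart envs

from dqn_model import DQNModel  # hypothetical import paths
from rl_agent import RLAgent

model_weights = None            # or a path to saved weights to resume from
learn_param = {
    'nb_frames': 3,             # the only key read directly above; RLAgent
                                # presumably accepts further keyword arguments
}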
Example No. 2
def train_model():
    '''
    Trains a primitive DQN model on the configured VizDoom scenario.

    '''
    # Initiates VizDoom Scenario
    doom = DoomScenario(scenario)
    # print(doom.get_processed_state(depth_radius, depth_contrast).shape[-2:])
    # Initiates Model
    model = DQNModel(resolution=doom.get_processed_state(
        depth_radius, depth_contrast).shape[-2:],
                     nb_frames=learn_param['nb_frames'],
                     actions=doom.actions,
                     depth_radius=depth_radius,
                     depth_contrast=depth_contrast)

    # print("number of actions: ", len(doom.actions))   # 16

    if model_weights:
        print("with a pretrained weights-------by amber")
        model.load_weights(model_weights)
    agent = RLAgent(model, **learn_param)

    # Perform Reinforcement Learning on Scenario
    agent.train(doom)
Example No. 3
def main():
    config = dict()
    config['lr'] = 1e-7
    config['stocks'] = ['a', 'aa']
    config['stock_num'] = len(config['stocks'])

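    # TF1-style graph reset; under TensorFlow 2 this lives at tf.compat.v1.reset_default_graph().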
    tf.reset_default_graph()

    agent = RLAgent(config)
    agent.RL_train()
Example No. 4
def test_model(runs=1):
    '''
    Method used to test DQN models on a VizDoom scenario. Testing runs are
    replayed at a higher resolution (800x600).

    Param:

    runs - int : number of test runs performed on the model.

    '''
    # Initiates VizDoom Scenario
    doom = DoomScenario(scenario)

    # Load Model and Weights
    model = DQNModel(resolution=doom.get_processed_state(
        depth_radius, depth_contrast).shape[-2:],
                     nb_frames=test_param['nb_frames'],
                     actions=doom.actions,
                     depth_radius=depth_radius,
                     depth_contrast=depth_contrast)
    model.load_weights(model_weights)
    agent = RLAgent(model, **test_param)

    print("\nTesting DQN-Model:", model_weights)
    # Run Scenario and play replay
    for i in range(runs):
        doom = DoomScenario(scenario)
        doom.run(agent, save_replay='test.lmp', verbose=True)
        doom.replay('test.lmp', doom_like=False)
Example No. 5
    def __init__(self, _id, tsc_data, conn, args, exp_replay, neural_networks,
                 eps, rl_stats, reward):
        super(RLTrafficSignalController,
              self).__init__(_id, tsc_data, conn, args)
        self.rlagent = RLAgent(neural_networks, eps, exp_replay,
                               tsc_data['n_green_phases'], args.n_steps,
                               args.batch, args.replay, args.gamma)
        ###set intersection to red default
        self.id = _id
        #self.phase_buffer = deque()
        self.exp = {}
        self.current_phase = tsc_data['all_red']
        self.args = args
        self.phase_deque = deque()
        self.state_deque = deque()
        self.acting = False
        self.rl_stats = rl_stats
        self.reward = reward
        self.rewards = []
Example No. 6
    def __init__(self, env, gamma, learning_rate, epsilon, epsilon_min,
                 epsilon_decay, divisor, buckets, training_episodes,
                 testing_episodes, frames):

        RLAgent.__init__(self, env, training_episodes, testing_episodes,
                         frames)
        self.env = env
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.divisor = divisor
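        # Note: the `buckets` argument is not used in this snippet; the bucket counts are hardcoded below.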
        self.buckets = (3, 3, 6, 6)

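        # Q-table: one axis per discretized state dimension plus a trailing action axis.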
        self.Q = np.zeros(self.buckets + (self.env.action_space.n, ))
Example No. 7
    def __init__(self, screen, speed, use_keras=True, pre_trained=True):
        super().__init__(screen)
        self.total_training_games = TOTAL_TRAINING_GAMES
        # takes in x, y of the snake and the speed of the snake
        self.agent = RLAgent(speed, use_keras, pre_trained)
        self.go_through_boundary = True
        # total number of games required to train
        self.idle_frames = 0
        self.total_steps = 0

        # # try to load the numpy data, if not possible then set the training_data to an empty list
        # try:
        #     # loaded training_data needs to be converted into a list
        #     self.training_data = np.load("./rl-learning-data/rl-data.npy").tolist()
        #     print("Training data loaded from disk...")
        # except:
        #     print("Training data couldn't be loaded from disk...")
        #     self.training_data = []

        # uncomment this if you decide to load data
        self.training_data = []
Example No. 8
    def __init__(self, env, config, epsilon, training_episodes,
                 testing_episodes, frames):

        RLAgent.__init__(self, env, training_episodes, testing_episodes,
                         frames)

        self.epsilon = epsilon
        self.name = config.name

        self.action_space_dim = self.env.action_space.n
        self.observation_space_dim = self.env.observation_space.shape[0]

        # Config has all hyperparameters stored.
        self.config = config

        self.memory = deque(maxlen=self.config.memory_size)
        self.replay_counter = 0

        # Keep track of how many frames the model ran through in total.
        self.training_frame_count = 0

        self.model = self.initialize_model()
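
initialize_model() is defined elsewhere in the source class. Purely as an illustration of the kind of network such a DQN agent typically builds (a generic sketch, not the project's actual architecture):

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


def initialize_model_sketch(observation_space_dim, action_space_dim,
                            learning_rate=1e-3):
    # Feed-forward Q-network: state vector in, one Q-value per action out.
    model = Sequential([
        Dense(64, activation='relu', input_shape=(observation_space_dim,)),
        Dense(64, activation='relu'),
        Dense(action_space_dim, activation='linear'),
    ])
    model.compile(loss='mse', optimizer=Adam(learning_rate=learning_rate))
    return model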
Example No. 9
def run_weights():

    env = gym.make('Mario-Kart-Luigi-Raceway-v0')

    resolution = (120, 160)

    actions = [
        [-60, 0, 1, 0, 0],  # left
        [60, 0, 1, 0, 0],   # right
        [0, -80, 0, 1, 0],  # back
        [0, 0, 1, 0, 0],    # go straight
        # [0, 0, 0, 1, 0],  # brake
    ]

    # Load Model and Weights
    model = DQNModel(resolution=resolution,
                     nb_frames=test_param['nb_frames'],
                     actions=actions)

    model.load_weights(model_weights)

    agent = RLAgent(model, **test_param)

    agent.test(env)
Example No. 10
def start_game():
    # p1 = KickAI(gateway)
    # p1 = MCTS(gateway)
    p1 = RLAgent(gateway)
    p2 = Machete(gateway)
    # p2 = DisplayInfo(gateway)
    manager.registerAI(p1.__class__.__name__, p1)
    manager.registerAI(p2.__class__.__name__, p2)
    print("Start game")

    game = manager.createGame("ZEN", "ZEN", p1.__class__.__name__,
                              p2.__class__.__name__, GAME_NUM)
    manager.runGame(game)

    print("After game")
    sys.stdout.flush()
Example No. 11
def show_policy(rl_agent: RLAgent, size: int = 8) -> None:
    """
    Visualize agent policy for FrozenLake environments.

    Prints characters that looks like maximum action for each possible state. Description:
    '<' - left
    '^' - up
    '>' - right
    '.' - down

    Args:
        rl_agent: Trained agent.
        size: Size of area in chosen environment along single dimension.
    """
    actions_viz = {
        0: "<",
        1: ".",
        2: ">",
        3: "^",
    }
    actions = [rl_agent.get_action(i) for i in range(size**2)]
    viz = "".join(list(map(actions_viz.get, actions)))  # type: ignore
    for i in range(size):
        print(viz[i * size:(i + 1) * size])
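
A minimal usage sketch (the stub agent below is purely illustrative; any object exposing get_action(state) -> int works):

class GreedyRightAgent:
    # stand-in for a trained agent: always picks action 2, drawn as '>'
    def get_action(self, state: int) -> int:
        return 2


show_policy(GreedyRightAgent(), size=8)  # prints an 8x8 grid of '>' characters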
Example No. 12
from PyQt5.QtWidgets import QApplication
from Window import Window
from EmulatorInterface import EmulatorInterface
from RLAgent import RLAgent

import sys
'''
    Creates a global Agent object in order
    to train the model
'''
agent = RLAgent()
'''
    :desc:
        This is the update function that can be
        used each frame to take control of the emulator.
        Below are some examples in the comments where you can
        use an emulator object to map key presses
        to the 
'''


def onUpdate(window, emulator):
    src = window.grabScreenshot()
    global agent
    '''
        TODO:
            Here we can simulate the AI pressing the buttons.
            Examples:
            
            emulator.emulatePress("throttle") # Emulates driving forward
            emulator.emulatePress("right") # Emulates steering to the right
Example No. 13
# Initiates the env
env = gym.make('Mario-Kart-Luigi-Raceway-v0')

resolution = (120, 160)

actions = [
    [-60, 0, 1, 0, 0],  # left
    [60, 0, 1, 0, 0],   # right
    [0, -80, 0, 1, 0],  # back
    [0, 0, 1, 0, 0],    # go straight
    # [0, 0, 0, 1, 0],  # brake
]

# Initiates Model
model = DQNModel(resolution=resolution,
                 nb_frames=learn_param['nb_frames'],
                 actions=actions)

# print("number of actions: ", len(doom.actions))   # 16

if model_weights:
    model.load_weights(model_weights)
else:
    print("Please provide a model_weights file")

agent = RLAgent(model, **learn_param)

# pick a step number at random to capture a random screenshot
agent.visualize(env)
Example No. 14
max_epsilon = 1.0
no_episodes = [1000, 2000, 3000, 4000, 5000]

wonRL_alpha=[]

def plot(list2):
    print(alpha)
    print(list2)
    # Plot
    plt.plot(epsilon, list2)
    plt.xlim(0, 1)
    plt.ylim(100, 2500)
    # naming the x axis
    plt.xlabel('epsilon')
    # naming the y axis
    plt.ylabel('No times RL agent won')
    # function to show the plot
    plt.show()



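# Note: epsilon, alpha, gamma, exploration_decay_rate, min_epsilon, plt (matplotlib.pyplot),
# RandomAgent and Environment are assumed to be defined earlier in the source module.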
for i in range(len(epsilon)):
    player1 = RLAgent(alpha[0], gamma[4], epsilon[i], exploration_decay_rate, min_epsilon, max_epsilon)
    player2 = RandomAgent()
    Env = Environment(player1, player2, no_episodes[4])
    wonRL_alpha.append(Env.trainAgent())
    Env.saveIntoFile(i)
plot(wonRL_alpha)


Example No. 15
class RLAgentPlayer(Player):
    def __init__(self, screen, speed, use_keras=True, pre_trained=True):
        super().__init__(screen)
        self.total_training_games = TOTAL_TRAINING_GAMES
        # takes in x, y of the snake and the speed of the snake
        self.agent = RLAgent(speed, use_keras, pre_trained)
        self.go_through_boundary = True
        # total number of games required to train
        self.idle_frames = 0
        self.total_steps = 0

        # # try to load the numpy data, if not possible then set the training_data to an empty list
        # try:
        #     # loaded training_data needs to be converted into a list
        #     self.training_data = np.load("./rl-learning-data/rl-data.npy").tolist()
        #     print("Training data loaded from disk...")
        # except:
        #     print("Training data couldn't be loaded from disk...")
        #     self.training_data = []

        # uncomment this if you decide to load data
        self.training_data = []

    def consumption_check(self):
        if collision(self.agent.body, self.food_stack[0]):
            return True
        else:
            return False

    def display_info(self, score, high_score):
        pygame.font.init()

        default_font = pygame.font.get_default_font()
        font_renderer = pygame.font.Font(default_font, 10)

        # To create a surface containing `Some Text`
        label = font_renderer.render("Score - {}, High score - {}".format(
            score, high_score), 1, (0, 0, 0))  # RGB Color
        self.screen.blit(label, (0, 0))

    def get_angle(self):

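        # Signed angle between the snake's heading and the direction to the food,
        # normalized by pi so the returned value lies in (-1, 1].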
        head_x = self.agent.body.get_x()
        head_y = self.agent.body.get_y()

        segment_x = self.agent.body.body[0].get_x()
        segment_y = self.agent.body.body[0].get_y()

        food_x = self.food_stack[0].get_x()
        food_y = self.food_stack[0].get_y()

        snake_direction = np.array([head_x, head_y]) - np.array(
            [segment_x, segment_y])
        food_direction = np.array([food_x, food_y]) - np.array(
            [head_x, head_y])

        a = snake_direction / np.linalg.norm(snake_direction)
        b = food_direction / np.linalg.norm(food_direction)

        return math.atan2(a[0] * b[1] - a[1] * b[0],
                          a[0] * b[0] + a[1] * b[1]) / math.pi

    def map_keys(self, pred):
        if self.agent.body.current_direction == "right":
            # left from current point of view
            if pred == -1:
                self.agent.body.change_direction("up")
            # right from current point of view
            elif pred == 1:
                self.agent.body.change_direction("down")

        elif self.agent.body.current_direction == "left":
            # left from current point of view
            if pred == -1:
                self.agent.body.change_direction("down")
            # right from current point of view
            elif pred == 1:
                self.agent.body.change_direction("up")

        elif self.agent.body.current_direction == "up":
            # left from current point of view
            if pred == -1:
                self.agent.body.change_direction("left")
            # right from current point of view
            elif pred == 1:
                self.agent.body.change_direction("right")

        elif self.agent.body.current_direction == "down":
            # left from current point of view
            if pred == -1:
                self.agent.body.change_direction("right")
            # right from current point of view
            elif pred == 1:
                self.agent.body.change_direction("left")

    def get_input_data(self):
        # predictions of self-collision for each of the next frame's possible movements
        coll_pred = self.agent.body.self_collision_prediction()
        # get distance from the snake and food
        distance_from_food = self.agent.body.distance_from_food(
            self.food_stack[0])
        angle = self.get_angle()
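        # network input: the three self-collision predictions plus the normalized food angle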
        return [coll_pred[0], coll_pred[1], coll_pred[2], angle]

    def gather_training_data(self, total_training_games=TOTAL_TRAINING_GAMES):
        self.total_training_games = total_training_games
        self.wrong_turn = 0
        self.wrong_direction = 0
        self.right_direction = 0

        for _ in tqdm(range(self.total_training_games)):
            self.one_game_iteration()

        # uncomment to save the training data

        # print("Training data saved to disk...")
        # # save the numpy data
        # np.save("./rl-learning-data/rl-data.npy", self.training_data)

        average_steps = self.total_steps / self.total_training_games

        print("Total Number of right directions: {}".format(
            self.right_direction))
        print("Total Number of wrong directions: {}".format(
            self.wrong_direction))
        print("Total Number of wrong turns: {}".format(self.wrong_turn))
        print("Average frames rendered per game: {}".format(average_steps))

    def train_agent(self):
        print("Begining to train with {} data".format(len(self.training_data)))
        self.agent.learn(self.training_data)

    def one_game_iteration(self):
        score = 3
        prev_food_distance = self.agent.body.distance_from_food(
            self.food_stack[0])
        prev_nn_data = self.get_input_data()
        # end dictates if the game has finished or not, initially it will be false
        end = False
        # the game is played until it ends, i.e. until the snake hits the wall or collides with itself
        while not end:
            self.total_steps += 1
            end, curr_nn_data, current_action = self.render_training_frame()
            prev_nn_data.append(current_action)
            prev_nn_data = np.array(prev_nn_data)
            if end:
                self.wrong_turn += 1
                self.training_data.append([prev_nn_data, -1])
                # when game ends we reconstruct the body of the snake
                self.agent.create_new_body()
                self.spawn_food()
                break
            else:
                food_distance = self.agent.body.distance_from_food(
                    self.food_stack[0])
                if self.agent.body.score > score or food_distance < prev_food_distance:
                    self.right_direction += 1
                    self.training_data.append([prev_nn_data, 1])
                else:
                    self.wrong_direction += 1
                    self.training_data.append([prev_nn_data, 0])
                prev_food_distance = food_distance
                prev_nn_data = curr_nn_data
                score = self.agent.body.score
        # reset end to its initial state
        end = False
        # end of each game a new body is created
        self.agent.create_new_body()
        self.spawn_food()

    def process_training_data(self, right_direction, wrong_direction):
        new_training_data = []
        to_match = 0
        for i in range(len(self.training_data)):
            if self.training_data[i][1] == 1:
                to_match += 1
                if to_match != wrong_direction:
                    new_training_data.append(self.training_data[i])
            else:
                new_training_data.append(self.training_data[i])

        return new_training_data

    def render_training_frame(self):
        pygame.event.pump()

        for food in self.food_stack:
            food.draw(self.screen)

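        # random.randint is inclusive of both endpoints, so this yields -1, 0, 1 or 2;
        # only -1 and 1 cause a turn in map_keys, the other values keep the current direction.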
        action = random.randint(-1, 2)

        self.map_keys(action)

        end = self.agent.body.draw(self.screen, self.go_through_boundary)

        # check here if the snake ate the food
        if self.consumption_check():
            self.spawn_food()
            # finally we grow the snake as well by adding a new segment to the snake's body
            self.agent.body.grow()

        nn_data = self.get_input_data()

        return end, nn_data, action

    def kill_idle_game(self):
        self.idle_frames += 1
        if self.idle_frames == 400:
            self.idle_frames = 0
            return True

    def use_brain_to_move(self):
        prev_nn_data = self.get_input_data()

        predictions = []
        # generate all three possible actions and, for each candidate, ask the
        # neural network how beneficial stepping in that direction would be
        for action in range(-1, 2):
            nn_data = self.get_input_data()
            nn_data.append(action)
            nn_data = np.array(nn_data)
            # score this candidate action given the current observation
            predictions.append(self.agent.predict(nn_data))

        action = np.argmax(np.array(predictions))

        # argmax returns an index in 0..2; shift back into the action range -1..1
        action -= 1

        self.map_keys(action)

    def test_agent(self, dataset_games):
        game_iterations = 100
        total_score = 0
        high_score = 0

        max_step = 1000

        for _ in tqdm(range(game_iterations)):
            step = 0
            while True:
                step += 1
                score = self.agent.body.score

                for food in self.food_stack:
                    food.draw(self.screen)

                self.use_brain_to_move()

                end = self.agent.body.draw(self.screen,
                                           self.go_through_boundary)

                # when the snake dies and the game ends
                if end or step == max_step:
                    if score > high_score:
                        high_score = score

                    total_score += score
                    # break the loop when the game ends
                    self.agent.create_new_body()
                    break

                # check here if the snake ate the food
                if self.consumption_check():
                    self.idle_frames = 0
                    self.spawn_food()
                    # finally we grow the snake as well by adding a new segment to the snake's body
                    self.agent.body.grow()

        average_score = total_score / game_iterations

        data_prints = [
            "Neural Network played {}\n".format(dataset_games),
            "Highest Score in {} games: {}\n".format(game_iterations,
                                                     high_score),
            "Average Score in {} games: {}\n".format(game_iterations,
                                                     average_score)
        ]

        with open("test-result.txt", "a") as myfile:
            for prints in data_prints:
                myfile.write(prints)
                print(prints)

    def game_loop(self):
        game_iterations = 5
        high_score = 0

        for _ in range(game_iterations):
            while True:
                pygame.event.pump()

                self.screen.fill(self.background_color)

                score = self.agent.body.score
                self.display_info(score, high_score)

                for food in self.food_stack:
                    food.draw(self.screen)

                self.use_brain_to_move()

                end = self.agent.body.draw(self.screen,
                                           self.go_through_boundary)

                # if the snake dies, end the current game
                if end:
                    # when the snake dies
                    print("Died after turning its head -> {}".format(
                        self.agent.body.current_direction))
                    time.sleep(1)
                    if score > high_score:
                        high_score = score
                    # break the loop when the game ends
                    self.agent.create_new_body()
                    break

                # check here if the snake ate the food
                if self.consumption_check():
                    self.idle_frames = 0
                    self.spawn_food()
                    # finally we grow the snake as well by adding a new segment to the snake's body
                    self.agent.body.grow()

                pygame.display.flip()

                time.sleep(0.05)

        print("High score -> {}".format(high_score))
Example No. 16
    return np.random.random() * x - x / 2.


def randomtarget():
    geometry = {
        'coordinates': [[rnd(), rnd()], [rnd(), rnd()]],
        'orientations': [rnd(2 * np.pi)] * 2
    }
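    # note: under Python 3, map() returns a lazy iterator here rather than a list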
    geometry['coordinates'] = map(np.array, geometry['coordinates'])

    return ShapeByGeometry(geometry)


############### AGENTS
agents_pose = [Pose(0, 0), Pose(0, -2, -np.pi / 2)]
agents = [RLAgent(i, pose=agents_pose[i]) for i in range(NUM_AGENTS)]
eval_agents = [RLAgent(i, pose=agents_pose[i]) for i in range(NUM_AGENTS)]

############### ENVIRONMENT
env = FormationEnvironment(targetshape,
                           agents,
                           num_iterations=HP.NUM_ITERATIONS,
                           dt=HP.DT)
eval_env = FormationEnvironment(targetshape,
                                eval_agents,
                                num_iterations=HP.NUM_ITERATIONS,
                                dt=HP.DT)

############### PARTIALLY OBSERVED ENVS
agent_observed_envs = {}
for agent in env.agents.values():
Example No. 17
    def run(self):
        ####loop thru agents
        print('LEARNER neural networks')
        agent_networks = gen_neural_networks([tsc for tsc in self.agent_ids],
                                             self.net_data, self.args.hact,
                                             self.args.oact, self.args.lr,
                                             self.args.lre)

        ###load weights if we want
        if self.args.load is True:
            agent_weights = load_data('saved_weights.p')
            for tsc in self.agent_ids:
                agent_networks[tsc]['online'].set_weights( agent_weights[tsc] )

        ###send weights to actors
        for tsc in self.agent_ids:
            weights = agent_networks[tsc]['online'].get_weights()
            ###send to actors
            self.rl_stats[tsc]['online'] = weights
            agent_networks[tsc]['target'].set_weights( weights )
        ###ensure all learners have sent weights before starting
        self.barrier.wait()

        ###create rl agents using neural networks
        rl_agents = {tsc: RLAgent(agent_networks[tsc], self.args.eps,
                                  self.exp_replay[tsc],
                                  self.net_data['tsc'][tsc]['n_green_phases'],
                                  self.args.n_steps, self.args.batch,
                                  self.args.replay, self.args.gamma)
                     for tsc in self.agent_ids}
        
        ###wait until sufficient exp in replay to start making updates
        self.barrier.wait()

        ###timer for stats
        self.last_update = time.time()
        period = 60

        self.learn_time = time.time()

        if self.args.mode == 'train':
            ###reset n_exp count
            for agent in self.agent_ids:
                self.rl_stats[agent]['n_exp'] = 0

            while not self.finished_learning():                                                
                for tsc in rl_agents:                                                         
                    ###only do batch updates after something has been added to exp replay
                    if self.rl_stats[tsc]['n_exp'] > 0:
                        rl_agents[tsc].train_batch( self.rl_stats[tsc]['max_r'])
                        self.rl_stats[tsc]['n_exp'] -= 1
                        self.rl_stats[tsc]['updates'] += 1
                        ###send online weight to shared dict for actors
                        self.rl_stats[tsc]['online'] = rl_agents[tsc].get_params('online')
                                                                                               
                        ###clip exp replay
                        diff = len(self.exp_replay[tsc]) - self.args.replay
                        if diff > 0:
                            del self.exp_replay[tsc][:diff]
                                                                                               
                        ###update target network to online on regular interval
                        if self.rl_stats[tsc]['updates'] % self.args.target == 0:
                            ###set target to online params
                            rl_agents[tsc].set_params('target', self.rl_stats[tsc]['online'])
                ###try stats
                t = time.time() - self.last_update
                if t > period:
                    print('========= AGENT EXP PROGRESS UPDATE LEARNER '+str(self.idx)+' =====')
                    self.print_stats()
                    self.last_update = time.time()
                    T = time.time()-self.learn_time
                    ###use the minimum-progress agent for the ETA estimate
                    min_progress = min([ self.rl_stats[agent]['updates']/float(self.args.updates) for agent in self.agent_ids])
                    eta = self.ETA( min_progress, T )
                    print('==== ETA seconds: '+str( round(eta, 0) )+' minutes: '+str( round(eta/60.0, 2) )+' hours: '+str( round(eta/3600.0, 2) )+' ====')
                    #print(str(ETA( np.amin([ self.rl_stats[agent]['updates']/float(self.args.updates for agent in self.agent_ids])), T))
        print('...end learner '+str(self.idx))
Example No. 18
class RLTrafficSignalController(TrafficSignalController):
    ###implements a cycle, fixed uniform phase duration for all green phases
    def __init__(self, _id, tsc_data, conn, args, exp_replay, neural_networks,
                 eps, rl_stats, reward):
        super(RLTrafficSignalController,
              self).__init__(_id, tsc_data, conn, args)
        self.rlagent = RLAgent(neural_networks, eps, exp_replay,
                               tsc_data['n_green_phases'], args.n_steps,
                               args.batch, args.replay, args.gamma)
        ###set intersection to red default
        self.id = _id
        #self.phase_buffer = deque()
        self.exp = {}
        self.current_phase = tsc_data['all_red']
        self.args = args
        self.phase_deque = deque()
        self.state_deque = deque()
        self.acting = False
        self.rl_stats = rl_stats
        self.reward = reward
        self.rewards = []

    def update(self, local_obs):
        ###update state buffer
        state = get_density(local_obs, self.tsc_data['inc_lanes'],
                            self.tsc_data['lane_lengths'], self.args.v_len)
        self.state_deque.append(state)

    def next_phase_and_duration(self, local_obs):
        if len(self.phase_deque) == 0:
            if self.acting == True:
                if self.args.mode == 'train':
                    next_s = self.observe_state()
                    r = self.get_reward()
                    terminal = bool(np.sum(self.state_deque[-1]) == 0)
                    self.rlagent.store_experience(self.exp['s'], self.exp['a'],
                                                  next_s, r, terminal)
                    self.rl_stats['n_exp'] += 1.0 / self.args.n_steps
                    #if self.rl_stats['n_exp'] % 100 == 0:
                    #    print('exp replay size '+str(self.rl_stats['n_exp']))
                    #    print('updates '+str(self.rl_stats['updates']))

            if len(self.state_deque) == 0 or np.sum(self.state_deque[-1]) == 0:
                ###no vehicle present, default to all red
                self.phase_deque.append((self.tsc_data['all_red'], 1))
                self.acting = False
            else:
                ###observe state
                s = self.observe_state()
                self.exp['s'] = s
                ###get new params before acting
                self.rlagent.set_params('online', self.rl_stats['online'])

                ##take action using rl agent
                s = s[np.newaxis, ...]
                action_idx = self.rlagent.get_action(s)
                self.exp['a'] = action_idx

                ###change action index to green traffic signal phase
                next_green = self.tsc_data['int_to_action'][action_idx]
                self.acting = True

                ###add transition phases for desired duration
                transitions = get_transitions(self.current_phase, next_green)
                for trans in transitions:
                    if 'y' in trans:
                        t = self.yellow_t
                    else:
                        t = self.red_t
                    self.phase_deque.append((trans, t))
                self.phase_deque.append((next_green, self.a_repeat))

        next_phase_and_duration = self.phase_deque.popleft()
        next_phase = next_phase_and_duration[0]
        duration = next_phase_and_duration[1]

        return next_phase, duration

    def observe_state(self):
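        # state = per-lane traffic densities concatenated with a one-hot encoding
        # of the current signal phase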
        traffic_state = np.array(self.state_deque[-1])
        signal_state = np.array(
            self.tsc_data['phase_one_hot'][self.current_phase])
        s = np.concatenate([traffic_state, signal_state])
        #s = s[np.newaxis,...]
        return s

    def update_max_reward(self, r):
        abs_r = np.absolute(r)
        if abs_r > self.rl_stats['max_r']:
            self.rl_stats['max_r'] = abs_r

    def get_reward(self):
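        # reward = negative sum of the supplied per-edge metric (e.g. queue or delay)
        # over the intersection's incoming edges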
        r = -np.sum([self.reward(e) for e in self.tsc_data['inc_edges']])
        #self.rewards.append(r)
        self.update_max_reward(r)
        return r