def preprocess(state):
    """Route a raw frame to the preprocessor matching the active environment type."""
    if Config.instance().ENV_TYPE == Config.instance().CUSTOM or Config.instance().ENV_TYPE == Config.instance().HIT_PRACTICE:
        return preprocess_custom(state)
    elif Config.instance().ENV_TYPE == Config.instance().ATARI:
        return preprocess_gym(state)
    else:
        raise NotImplementedError
def load_sounds():
    """Initialize the mixer and cache the game's sound effects, if audio is enabled."""
    if Config.instance().ENABLE_AUDIO:
        pygame.mixer.init()
        Pong.sounds = {}
        Pong.sounds["return"] = pygame.mixer.Sound(Config.instance().AUDIO_DIR + "return.ogg")
        Pong.sounds["score"] = pygame.mixer.Sound(Config.instance().AUDIO_DIR + "score.ogg")
        Pong.sounds["bounce"] = pygame.mixer.Sound(Config.instance().AUDIO_DIR + "bounce.ogg")
def get_weight_image(model, neuron=0, layer=0, size=(Config.instance().HEIGHT // 2, Config.instance().WIDTH // 2)):
    """Render the input weights feeding one hidden neuron as a grayscale image."""
    weights = model.get_weights()[layer][:, neuron]
    # Normalize and scale weights to pixel values (min-max scaling keeps values in 0-255
    # and avoids uint8 overflow)
    weights = weights - np.min(weights)
    weights = weights / (np.max(weights) + 1e-8)
    weights = weights * 255
    image = weights.reshape(size).astype(np.uint8)
    return image
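# Usage sketch (added for illustration, not part of the original source): write one
# neuron's weight image to disk next to the other analytics plots. `trained_model`
# (a Keras model) and the output path are assumptions.
def save_weight_image(trained_model, neuron=0, path="analytics/plots/weights_neuron_0.png"):
    import cv2  # assumed available, as in pong.py
    image = get_weight_image(trained_model, neuron=neuron, layer=0)
    cv2.imwrite(path, image)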
def act(self, state):
    """
    Infer action from state
    :param state: ndarray representing game state
    :return: (action id, confidence vector)
    """
    state = state.reshape([1, state.shape[0]])
    # Cache the hidden-layer activations and output probabilities for the
    # activation packet published to the visualizer
    self.last_hidden_activation = self.hl_model.predict(state, batch_size=1).squeeze()
    prob = self.model.predict(state, batch_size=1).flatten()
    self.last_output = prob
    # Sample an action from the policy's probability distribution
    action = np.random.choice(self.action_size, 1, p=prob)[0]
    # Store a rotated copy of the input state for visualization
    state_ravel = state.reshape(Config.instance().CUSTOM_STATE_SHAPE)
    self.last_state = np.rot90(state_ravel, axes=(0, 1), k=1).flatten()
    return action, prob
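# Illustrative sketch (an assumption, not from the original source): feeding a
# preprocessed frame difference into act(). `agent` is a loaded PGAgent and
# `current_frame`/`previous_frame` are already-preprocessed 2-D arrays.
def infer_from_frames(agent, current_frame, previous_frame):
    diff_state = current_frame - previous_frame
    action_id, prob = agent.act(diff_state.ravel())  # act() expects a flattened state vector
    return action_id, prob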
def setup_custom(up=True, angle=None, hit_practice=False):
    """Build a custom Pong environment with a deterministic ball start angle."""
    default_angle = 90 if up else -90
    cfg = Config()
    cfg.RANDOMIZE_START = False
    if hit_practice:
        cfg.ENV_TYPE = cfg.HIT_PRACTICE
    if angle is not None:
        cfg.BALL_START_ANGLES = [angle]
    else:
        cfg.BALL_START_ANGLES = [default_angle]
    env = Pong(config=cfg)
    env.reset()
    return env
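# Usage sketch (assumption, not in the original source): reproduce a single serve at
# a fixed angle, e.g. when debugging paddle physics. The step() call mirrors the way
# simulator.py drives the environment.
def replay_fixed_serve(angle=75):
    cfg = Config.instance()
    env = setup_custom(up=True, angle=angle)
    # Step once with both paddles taking the first configured action
    state, reward, done = env.step(cfg.ACTIONS[0], cfg.ACTIONS[0], frames=cfg.AI_FRAME_INTERVAL)
    return state, reward, done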
def __init__(self, hit_practice=False, config=None):
    """
    Initialize basic game state
    :param hit_practice: Trigger training mode with a single paddle and randomly spawned balls
                         See the Ball class's hit_practice method.
    """
    if Pong.sounds is None:
        Pong.load_sounds()
    if config is None:
        config = Config.instance()
    self.config = config

    # Holds last raw screen pixels for rendering
    self.last_screen = None

    self.hit_practice = hit_practice
    self.score_bottom = 0
    self.score_top = 0
    self.bottom = Pong.Paddle("bottom", config=config) if not self.hit_practice else None
    self.top = Pong.Paddle("top", config=config)
    self.ball = Pong.Ball(hit_practice=hit_practice, config=config)
    self.frames = 0
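# Smoke-test sketch (not part of the original file): run a short hit-practice episode
# with random paddle actions, mirroring how simulator.py steps the environment. It
# assumes random_action() (shown elsewhere in this module) is reachable as written.
def run_hit_practice_episode(max_frames=500):
    env = Pong(hit_practice=True)
    state = env.reset()
    for _ in range(max_frames):
        state, reward, done = env.step(None, random_action(), frames=Config.instance().AI_FRAME_INTERVAL)
        if done:
            break
    return env.score_top, env.score_bottom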
self.state.publish("paddle2/action", {"action": str(action)}) self.state.publish("paddle2/frame", {"frame": current_frame_id}) model_activation = self.agent.get_activation_packet() self.state.publish("ai/activation", model_activation) if len(self.frame_diffs) > 1000: print( f"Frame distribution: mean {np.mean(self.frame_diffs)}, stdev {np.std(self.frame_diffs)} counts {np.unique(self.frame_diffs, return_counts=True)}" ) self.frame_diffs = [] def __init__(self, config, paddle1=True): self.config = config self.paddle1 = paddle1 self.paddle2 = not self.paddle1 self.agent = PGAgent(self.config.CUSTOM_STATE_SIZE, self.config.CUSTOM_ACTION_SIZE) self.agent.load(AIDriver.MODEL) self.state = AISubscriber( self.config, trigger_event=lambda: self.publish_inference()) self.last_frame_id = self.state.frame self.last_tick = time.time() self.frame_diffs = [] self.state.start() if __name__ == "__main__": config = Config.instance() instance = AIDriver(config)
import os

from multiprocessing import Pool
from tqdm import tqdm

from exhibit.shared.config import Config

"""
This file is the driver for training a new DRL pong model.

It brings together the following elements:

* The environment simulator (either the custom one found in pong.py or the Atari emulator provided by OpenAI Gym)
  Both environments are wrapped by the interface in simulator.py
* The two agents (some combination of human-controlled, DRL, and hard-coded agents found in player.py)

The level of abstraction in this file is pretty high, and it really only exists to further abstract the
training process into a few environmental and training hyperparameters that are easy to experiment with,
and to provide convenient monitoring and graphing of the training process.
"""

GAME_BATCH = 10
MODE = Config.instance().HIT_PRACTICE  # Config.instance().CUSTOM
LEARNING_RATE = 0.001
DENSE_STRUCTURE = (200,)
ALWAYS_FOLLOW = False
PARALLELIZE = False

if __name__ == "__main__":
    # Ensure directory safety
    os.makedirs("models/bottom", exist_ok=True)
    os.makedirs("models/top", exist_ok=True)
    os.makedirs("analytics", exist_ok=True)
    os.makedirs("analytics/plots", exist_ok=True)

    # Initialize for checks & scope
    start_index = None
def play_sound(sound):
    """Play a cached sound effect by name, if audio is enabled and the sound exists."""
    if Config.instance().ENABLE_AUDIO and Pong.sounds is not None and sound in Pong.sounds:
        try:
            playback = Pong.sounds[sound].play()
        except Exception as e:
            print(e)
def random_action():
    return choice(Config.instance().ACTIONS)
import numpy as np
import cv2
import math
import keyboard
import time
from random import choice, randint

from exhibit.shared.config import Config

if Config.instance().ENABLE_AUDIO:
    import pygame.mixer


class Pong:
    """
    This class captures all of the game logic for Pong.
    It was used instead of OpenAI Gym or various other publicly available alternatives
    in order to allow for complete flexibility.
    """
    sounds = None

    @staticmethod
    def read_key(up, down):
        """
        Converts keyboard state to internal action state
        :param up: key code for "up" control
        :param down: key code for "down" control
        :return: Action code: 0 for up, 1 for down, 2 for nothing
        """
        if keyboard.is_pressed(up):
            return 0
        elif keyboard.is_pressed(down):
            return 1
        return 2
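# Usage sketch (an assumption, not from the original source): poll two keys for a
# human-controlled paddle and translate the 0/1/2 code into a configured action.
# The key names follow the `keyboard` library's conventions; mapping the code onto
# Config.ACTIONS by index is also an assumption.
def read_human_action():
    action_code = Pong.read_key("up", "down")
    return Config.instance().ACTIONS[action_code]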
def simulate_game(config, env_type=Config.instance().CUSTOM, left=None, right=None, batch=1, visualizer=None):
    """
    Wraps both the OpenAI Gym Atari Pong environment and the custom Pong environment
    in a common interface, useful to test the same training setup against both environments
    """
    env = None
    state_size = None
    games_remaining = batch
    state_shape = config.CUSTOM_STATE_SHAPE

    if env_type == config.CUSTOM:
        env = Pong()
        state_size = config.CUSTOM_STATE_SIZE
        state_shape = config.CUSTOM_STATE_SHAPE
        if type(left) == BotPlayer:
            left.attach_env(env)
        if type(right) == BotPlayer:
            right.attach_env(env)
    elif env_type == config.HIT_PRACTICE:
        env = Pong(hit_practice=True)
        state_size = config.CUSTOM_STATE_SIZE
        state_shape = config.CUSTOM_STATE_SHAPE
        if type(right) == BotPlayer:
            right.attach_env(env)

    # Training data
    states = []
    states_flipped = []
    actions_l = []
    actions_r = []
    rewards_l = []
    rewards_r = []
    probs_l = []
    probs_r = []

    # Prepare to collect fun data for visualizations
    render_states = []
    model_states = []

    score_l = 0
    score_r = 0
    last_state = np.zeros(state_shape)
    state = env.reset()
    if visualizer is not None:
        visualizer.base_render(utils.preprocess_custom(state))
    i = 0
    while True:
        render_states.append(state.astype(np.uint8))
        current_state = utils.preprocess_custom(state)
        diff_state = current_state - last_state
        model_states.append(diff_state.astype(np.uint8))
        diff_state_rev = np.flip(diff_state, axis=1)
        last_state = current_state
        action_l, prob_l, action_r, prob_r = None, None, None, None
        x = diff_state.ravel()
        x_flip = diff_state_rev.ravel()
        if left is not None:
            action_l, prob_l = left.act(x_flip)
        if right is not None:
            action_r, prob_r = right.act(x)
        states.append(x)

        state, reward, done = None, None, None
        if env_type == config.HIT_PRACTICE:
            state, reward, done = env.step(None, config.ACTIONS[action_r], frames=config.AI_FRAME_INTERVAL)
        else:
            state, reward, done = env.step(config.ACTIONS[action_l], config.ACTIONS[action_r], frames=config.AI_FRAME_INTERVAL)

        reward_l = float(reward[0])
        reward_r = float(reward[1])

        # Save observations
        probs_l.append(prob_l)
        probs_r.append(prob_r)
        actions_l.append(action_l)
        actions_r.append(action_r)
        rewards_l.append(reward_l)
        rewards_r.append(reward_r)

        if reward_r < 0:
            score_l -= reward_r
        if reward_r > 0:
            score_r += reward_r

        if done:
            games_remaining -= 1
            print('Score: %f - %f.' % (score_l, score_r))
            utils.write(f'{score_l},{score_r}', 'analytics/scores.csv')
            if games_remaining == 0:
                metadata = (render_states, model_states, (score_l, score_r))
                return states, (actions_l, probs_l, rewards_l), (actions_r, probs_r, rewards_r), metadata
            else:
                score_l, score_r = 0, 0
                state = env.reset()
        i += 1
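# Illustrative helper (an assumption, not the project's actual training code): the
# per-frame reward lists returned above are sparse, so policy-gradient training
# typically spreads each game's reward backwards with a discount factor and then
# normalizes before computing gradients. A minimal sketch of that step:
def discount_rewards(rewards, gamma=0.99):
    discounted = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if rewards[t] != 0:
            running = 0.0  # reset the running sum at game boundaries
        running = running * gamma + rewards[t]
        discounted[t] = running
    # Normalize so gradient magnitudes are comparable across batches
    discounted -= discounted.mean()
    discounted /= (discounted.std() + 1e-8)
    return discounted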