def __init__(self, rand_seed, rom, display=False, frame_skip=4, no_op_max=7):
    self.ale = ale_python_interface.ALEInterface()
    self.ale.setInt(b'random_seed', rand_seed)
    self.ale.setFloat(b'repeat_action_probability', 0.0)
    self.ale.setBool(b'color_averaging', True)
    self.ale.setInt(b'frame_skip', frame_skip)
    self._no_op_max = no_op_max

    if display:
        self._setup_display()

    self.ale.loadROM(rom.encode('ascii'))

    # Collect the minimal action set.
    self.real_actions = self.ale.getMinimalActionSet()

    # Raw Atari screen: height=210, width=160.
    self._screen = np.empty((210, 160, 1), dtype=np.uint8)

    self.reset()
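# A minimal, self-contained sketch of the ALE calls used throughout these
# snippets: seed the emulator, load a ROM, query the minimal action set, and
# step with random actions until the episode ends. The ROM path is a
# placeholder.
import numpy as np
import ale_python_interface

ale = ale_python_interface.ALEInterface()
ale.setInt(b'random_seed', 123)
ale.setFloat(b'repeat_action_probability', 0.0)
ale.loadROM(b'roms/breakout.bin')
actions = ale.getMinimalActionSet()
episode_reward = 0.0
while not ale.game_over():
    # act() advances the emulator one step and returns the reward.
    episode_reward += ale.act(actions[np.random.randint(len(actions))])
ale.reset_game()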
def __init__(self, opt):
    self.envType = opt.environment
    self.show = opt.show
    self.delay = opt.delay
    self.width = opt.width
    self.height = opt.height
    self.noopMax = opt.noopMax
    if self.show is True:
        cv2.namedWindow("show")
    if self.envType == "ALE":
        import ale_python_interface
        self.env = ale_python_interface.ALEInterface()
        if opt.randomSeed is None:
            self.env.setInt('random_seed', 0)
        else:
            self.env.setInt('random_seed', opt.randomSeed)
        self.env.setInt("frame_skip", opt.frameSkip)
        self.env.setBool('color_averaging', opt.colorAverageing)
        self.env.setInt('phosphor_blend_ratio', opt.phosphorBlendRatio)
        self.env.setInt('max_num_frames_per_episode',
                        opt.maxNumFramesPerEpisode)
        # print self.env.getInt('phosphor_blend_ratio')
        self.env.setBool('sound', False)
        self.env.setBool('display_screen', False)
        self.env.setFloat("repeat_action_probability", 0.0)
        self.env.loadROM(opt.pathRom)
        self.legal_actions = self.env.getMinimalActionSet()
        self.n_actions = len(self.legal_actions)
def main():
    ale = ale_python_interface.ALEInterface()
    ale.loadROM('aleroms/breakout.bin')
    actions = ale.getMinimalActionSet()
    params = shared.bindNew()
    print 'building main model...'
    model = make_model(len(actions))
    train, get_pol, get_v, get_grads, update_from_grads, lr, beta = make_funcs(
        model, params)
    # Shared-memory copies of the parameters; each mparray exposes its
    # underlying numpy view at index 3.
    global_params = [mparray(p.get_value()) for p in params]
    for gp, p in zip(global_params, params):
        p.set_value(gp[3], borrow=True)
    nthreads = 4
    thread_grads = [[mparray_zero(p.get_value().shape) for p in params]
                    for i in range(nthreads)]
    thread_counters = mparray_zero((nthreads, ))
    counters = mpasnp(thread_counters)
    running = mparray_zero((1, ))
    running[3][0] += nthreads
    print 'starting threads'
    manager = mp.Manager()
    rewards = manager.list()
    for i in range(nthreads):
        p = mp.Process(target=thread_learner,
                       args=(i, global_params, thread_grads[i],
                             thread_counters, running, rewards))
        p.start()
    mp.Process(target=plot_stuff, args=(rewards, running)).start()
    try:
        last_draw = time.time()
        upds = numpy.zeros(nthreads)
        cnts = numpy.zeros(nthreads)
        while running[3][0] > 0:
            for i in range(nthreads):
                if counters[i] > 0:
                    # Apply the averaged gradients accumulated by worker i,
                    # then zero its buffers while holding the shared lock.
                    with thread_grads[i][0][0].get_lock():
                        grads = [g[3] / counters[i] for g in thread_grads[i]]
                        update_from_grads(*grads)
                        for g in thread_grads[i]:
                            g[3][:] *= 0
                        upds[i] += counters[i]
                        cnts[i] += 1
                        counters[i] = 0
            if time.time() - last_draw > 2:
                print 'updates:', numpy.mean(
                    upds / cnts), numpy.mean(upds), running[3]
                upds = numpy.zeros(nthreads)
                cnts = numpy.zeros(nthreads)
                last_draw = time.time()
    finally:
        running[3][0] = 0
def initialize(pid, device, flags, comm, share_comm):
    message = 'initialize process: {:d} with GPU: {} game: {}'.format(
        pid, device, flags.rom)
    comm.send([-1, 'print', message], dest=flags.threads)

    import os
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = device[-1]
    np.random.seed(flags.seed)
    tf.set_random_seed(flags.seed)

    try:
        import ale_python_interface
    except ImportError:
        import atari_py.ale_python_interface as ale_python_interface

    # Initialize the ALE environment.
    if flags.rom.endswith('.bin'):
        rom = flags.rom
    else:
        rom = "%s.bin" % flags.rom
    full_rom_path = os.path.join(flags.roms_path, rom)

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', flags.seed)
    ale.setBool('sound', False)
    ale.setBool('display_screen', False)
    ale.setFloat('repeat_action_probability',
                 flags.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    # Adjust flags.
    flags.num_actions = num_actions
    flags.logs_path = os.path.join(flags.logs_path,
                                   '#' + str(pid) + '_' + flags.rom)
    tf.gfile.MakeDirs(flags.logs_path)

    # Print settings.
    setting_file = open(os.path.join(flags.logs_path, 'flags.txt'), mode='w+')
    for key, item in flags.__flags.items():
        setting_file.write(key + ' : ' + str(item) + '\n')

    # Initialize the agent.
    if flags.ot:
        network = neural_networks.OptimalityTighteningNetwork(
            pid, flags, device, share_comm)
    else:
        network = neural_networks.DeepQNetwork(pid, flags, device, share_comm)
    setting_file.write(network.nn_structure_file)
    setting_file.close()

    if flags.ot:
        agent = agents.OptimalityTigheningAgent(pid, network, flags, comm,
                                                share_comm)
    else:
        agent = agents.QLearning(pid, network, flags, comm, share_comm)

    interaction.Interaction(pid, ale, agent, flags, comm).start()
def record_new(rom, output, frames, episodes, seed, snapshot_interval):
    rom_name = os.path.splitext(os.path.split(rom)[-1])[0]
    pygame.init()
    ale = ALE.ALEInterface()
    ale.setInt('random_seed', seed)
    ale.setFloat('repeat_action_probability', 0)
    ale.setBool('color_averaging', False)
    ale.setBool('display_screen', True)
    ale.loadROM(rom)
    demo = Demonstration(rom=rom_name, action_set=ale.getMinimalActionSet())
    record(ale, demo, output, frames, episodes, snapshot_interval)
def resume(partial_demo, rom, frames, episodes, snapshot_interval):
    pygame.init()
    demo = Demonstration.load(partial_demo)
    ale = ALE.ALEInterface()
    ale.setFloat('repeat_action_probability', 0)
    ale.setBool('color_averaging', False)
    ale.setBool('display_screen', True)
    ale.loadROM(rom)
    # Restore the snapshot from the original recording, then begin a new
    # episode. N.b. this is needed to preserve state from the original
    # recording, like the seed.
    demo.reset_to_latest_snapshot(ale)
    ale.reset_game()
    record(ale, demo, partial_demo, frames, episodes, snapshot_interval)
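# Hypothetical driver for record_new()/resume() above; the argument values
# are placeholders, and the original project's command-line front end is
# assumed rather than shown.
if __name__ == '__main__':
    record_new(rom='roms/breakout.bin', output='demos/breakout.demo',
               frames=10000, episodes=5, seed=42, snapshot_interval=1000)
    # To continue an interrupted recording session later:
    # resume('demos/breakout.demo', 'roms/breakout.bin',
    #        frames=10000, episodes=5, snapshot_interval=1000)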
def _setup_ale(self, rom, display_game_screen):
    '''
    Args:
        rom (str): path to Atari rom.
        display_game_screen (bool): true iff the Atari screen should be
            displayed.

    Summary:
        Initializes the Atari Learning Environment.
    '''
    # Create the ALE interface and set the seed.
    self.ale = ale_interface.ALEInterface()
    self.ale.setInt('random_seed', 123)

    # Set up visuals and sound depending on the OS.
    if display_game_screen:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            self.ale.setBool('sound', False)  # Sound doesn't work on OSX.
        elif sys.platform.startswith('linux'):
            self.ale.setBool('sound', True)
        self.ale.setBool('display_screen', True)

    # Load the ROM.
    self.rom = rom.replace(".bin", "")
    full_rom_path = os.path.dirname(
        os.path.realpath(__file__)) + "/roms/" + self.rom + ".bin"
    self.ale.loadROM(full_rom_path)

    # Grab game details.
    self.ram_size = self.ale.getRAMSize()
    self.screen_width, self.screen_height = self.ale.getScreenDims()
    self.actions = self.ale.getLegalActionSet()

    # Get RAM and initial image info.
    ram = numpy.zeros((self.ram_size), dtype=numpy.uint8)
    ram_state = self.ale.getRAM(ram)
    screen_data = self.ale.getScreenGrayscale(
    ) if self.grayscale else self.ale.getScreenRGB()

    # Make the initial state.
    self.init_state = AtariState(screen_data, ram_state,
                                 objects_from_image=self.image_to_object)
def __init__(self, rom, outSize):
    self._ale = ALE.ALEInterface()
    self._ale.setInt("random_seed".encode(), int(time.time()))
    self._ale.setFloat("repeat_action_probability".encode(), 0)
    self._ale.setBool("color_averaging".encode(), True)
    self._ale.loadROM(rom.encode())
    d = self._ale.getScreenDims()
    ## The size of the screen in the form [height, width]
    self.screenSize = [d[1], d[0]]
    ## The size of the images returned by getScreen, in the form
    #  [height, width]
    self.outSize = outSize
    self._RAWScreen = np.empty([d[0] * d[1]], dtype=np.uint8)
    self._RAWScaled = np.empty(self.outSize, dtype=np.uint8)
    self._RGBScreen = np.empty([d[1], d[0], 3], dtype=np.uint8)
    t = time.localtime()
    self._creaTime = str(t.tm_year) + "-" + str(t.tm_mon) + \
                     "-" + str(t.tm_mday) + \
                     "-" + str(t.tm_hour) + \
                     "." + str(t.tm_min) + \
                     "." + str(t.tm_sec)
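# Sketch of how buffers like _RAWScreen above are typically filled and
# rescaled, assuming OpenCV is available. ALEInterface.getScreen() fills a
# preallocated 1-D uint8 buffer of width*height palette indices, and
# getScreenDims() returns (width, height). The helper name is an assumption.
import cv2
import numpy as np

def get_scaled_screen(ale, out_size):
    # out_size follows the [height, width] convention used above.
    w, h = ale.getScreenDims()
    raw = np.empty(w * h, dtype=np.uint8)
    ale.getScreen(raw)
    # cv2.resize expects the output size as (width, height).
    return cv2.resize(raw.reshape(h, w), (out_size[1], out_size[0]))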
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    agent = None
    if parameters.use_episodic_control:
        if parameters.qec_table is None:
            qec_table = EC_functions.QECTable(
                parameters.knn, parameters.state_dimension,
                parameters.projection_type,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, num_actions, rng)
        else:
            # Binary mode: the table is a pickled object.
            handle = open(parameters.qec_table, 'rb')
            qec_table = pickle.load(handle)

        agent = EC_agent.EpisodicControl(
            qec_table, parameters.ec_discount, num_actions,
            parameters.epsilon_start, parameters.epsilon_min,
            parameters.epsilon_decay, parameters.experiment_prefix, rng)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng)

    experiment.run()
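# Hypothetical entry point for the launch() functions in this section;
# `Defaults` stands in for the project's defaults class, which is not shown
# here.
if __name__ == '__main__':
    import sys
    launch(sys.argv[1:], Defaults, __doc__)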
def start_training(params):
    """
    Initialize rom, game, agent, network and start a training run
    """
    # Create a folder to hold results.
    exp_pref = "../results/" + params.EXPERIMENT_PREFIX
    time_str = time.strftime("_%m-%d-%H-%M_", time.gmtime())
    exp_dir = exp_pref + time_str + \
        "{}".format(params.LEARNING_RATE).replace(".", "p") + "_" \
        + "{}".format(params.DISCOUNT).replace(".", "p")
    try:
        os.stat(exp_dir)
    except OSError:
        os.makedirs(exp_dir)

    logger = logging.getLogger("DeepLogger")
    logger.setLevel(logging.INFO)

    # Logging file handler.
    # fh = logging.FileHandler(exp_dir + "/log.log")
    # Rotate the file when its size reaches 5 MB.
    fh = RotatingFileHandler(exp_dir + "/log.log", maxBytes=5000000,
                             backupCount=100)
    fh.setLevel(logging.INFO)

    # Console handler.
    ch = logging.StreamHandler()
    ch.setLevel(logging.INFO)

    formatter = logging.Formatter('%(message)s')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    logger.addHandler(fh)
    # Prevent nohup from producing a large log file; logging to file is
    # handled internally.
    # logger.addHandler(ch)

    log_params(logger, params)
    # logging.basicConfig(level=logging.INFO, filename=exp_dir + "/log.log")

    if params.DETERMINISTIC:
        rng = np.random.RandomState(12345)
    else:
        rng = np.random.RandomState()

    if params.CUDNN_DETERMINISTIC:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    # Init ALE.
    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', 123)
    ale.setBool('display_screen', params.DISPLAY_SCREEN)
    ale.setFloat('repeat_action_probability',
                 params.REPEAT_ACTION_PROBABILITY)
    full_rom_path = os.path.join(params.ROM_PATH, params.ROM_NAME)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())
    print "Legal actions: ", num_actions
    print ale.getMinimalActionSet()

    # Instantiate the network.
    logger.info("Setting up network...")
    network = None
    # Allows continuing training from a saved network, or watching one play.
    if params.NETWORK_PICKLE_FILE is None:
        logger.info("Initializing a new random network...")
        network = q_network.DeepQLearner(
            params.RESIZED_WIDTH, params.RESIZED_HEIGHT, num_actions,
            params.PHI_LENGTH, params.DISCOUNT, params.LEARNING_RATE,
            params.RMS_DECAY, params.RMS_EPSILON, params.MOMENTUM,
            params.CLIP_DELTA, params.FREEZE_INTERVAL, params.BATCH_SIZE,
            params.NETWORK_TYPE, params.UPDATE_RULE,
            params.BATCH_ACCUMULATOR, rng)
    else:
        logger.info("Loading network instance from file...")
        handle = open(params.NETWORK_PICKLE_FILE, 'rb')
        network = cPickle.load(handle)

    # Only used when dumping a random network.
    if params.RANDOM_NETWORK_PICKLE:
        import sys
        sys.setrecursionlimit(10000)
        result_net_file = open(params.EXPERIMENT_PREFIX + '.pkl', 'wb')
        print "File opened"
        cPickle.dump(network, result_net_file, -1)
        print "Pickle dumped"
        result_net_file.close()
        sys.exit(0)

    # Instantiate the agent.
    logger.info("Setting up agent...")
    agent = ale_agent.NeuralAgent(network, params.EPSILON_START,
                                  params.EPSILON_MIN, params.EPSILON_DECAY,
                                  params.REPLAY_MEMORY_SIZE, exp_dir,
                                  params.REPLAY_START_SIZE,
                                  params.UPDATE_FREQUENCY, rng)

    # Instantiate the experiment.
    logger.info("Setting up experiment...")
    experiment = ale_experiment.ALEExperiment(
        ale, agent, params.RESIZED_WIDTH, params.RESIZED_HEIGHT,
        params.RESIZE_METHOD, params.EPOCHS, params.STEPS_PER_EPOCH,
        params.STEPS_PER_TEST, params.FRAME_SKIP, params.DEATH_ENDS_EPISODE,
        params.MAX_START_NULLOPS, rng)

    # Run the experiment.
    logger.info("Running experiment...")
    experiment.run()
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    # Dump parameters for replication.
    time_str = time.strftime("%Y-%m-%d_%H-%M_", time.localtime())
    exp_dir = time_str + parameters.experiment_prefix
    exp_dir = os.path.join("results", exp_dir)
    if not os.path.isdir(exp_dir):
        os.makedirs(exp_dir)
    parameter_file = open(os.path.join(exp_dir, 'parameter.txt'), 'w', 0)
    parameter_file.write(str(parameters))
    parameter_file.flush()
    parameter_file.close()

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)

    avail_actions = ale.getMinimalActionSet()
    if parameters.train_all:
        num_actions = len(ale.getLegalActionSet())
    else:
        num_actions = len(avail_actions)
    print "avail_actions: " + str(avail_actions)
    print "num_actions: " + str(num_actions)

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, avail_actions,
            num_actions, parameters.phi_length, parameters.discount,
            parameters.learning_rate, parameters.rms_decay,
            parameters.rms_epsilon, parameters.momentum,
            parameters.clip_delta, parameters.freeze_interval,
            parameters.batch_size, parameters.network_type,
            parameters.update_rule, parameters.batch_accumulator, rng,
            parameters.train_all)
    else:
        handle = open(parameters.nn_file, 'rb')
        network = cPickle.load(handle)

    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        parameters.experiment_prefix, parameters.replay_start_size,
        parameters.update_frequency, rng, exp_dir, parameters.train_all)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng)

    experiment.run()
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    # For visualization.
    USE_SDL = False
    if parameters.display_screen:
        if USE_SDL:
            import sys
            if sys.platform == 'darwin':
                import pygame
                pygame.init()
                ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    agent = None
    if parameters.use_episodic_control:
        if parameters.qec_table is None:
            qec_table = EC_functions.QECTable(
                parameters.knn, parameters.state_dimension,
                parameters.projection_type,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, num_actions, rng)
        else:
            handle = open(parameters.qec_table, 'rb')
            qec_table = cPickle.load(handle)
            # If this doesn't work, load using the function below:
            # def try_to_load_as_pickled_object_or_None(filepath):
            #     """
            #     A defensive way to write pickle.load, allowing for very
            #     large files on all platforms.
            #     """
            #     max_bytes = 2 ** 31 - 1
            #     try:
            #         input_size = os.path.getsize(filepath)
            #         bytes_in = bytearray(0)
            #         with open(filepath, 'rb') as f_in:
            #             for _ in range(0, input_size, max_bytes):
            #                 bytes_in += f_in.read(max_bytes)
            #         obj = cPickle.loads(bytes_in)
            #     except:
            #         return None
            #     return obj
            # qec_table = try_to_load_as_pickled_object_or_None(handle)

        agent = IBL_agent.EpisodicControl(
            qec_table, parameters.ec_discount, num_actions,
            parameters.epsilon_start, parameters.epsilon_min,
            parameters.epsilon_decay, parameters.experiment_prefix, rng)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng)

    experiment.run()
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    if parameters.nn_file is None:
        print 'building network...'
        network = q_network.DeepQLearner(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
            parameters.phi_length, parameters.discount,
            parameters.learning_rate, parameters.rms_decay,
            parameters.rms_epsilon, parameters.momentum,
            parameters.clip_delta, parameters.freeze_interval,
            parameters.batch_size, parameters.network_type,
            parameters.update_rule, parameters.batch_accumulator, rng)
    else:
        print 'loading network...'
        handle = open(parameters.nn_file, 'rb')
        network = cPickle.load(handle)

    print 'building agent...'
    if (parameters.aws_secret_key and parameters.aws_access_key
            and parameters.s3_bucket):
        s3_utility = S3Utility(parameters.aws_access_key,
                               parameters.aws_secret_key,
                               parameters.s3_bucket)
    else:
        s3_utility = None
    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        parameters.experiment_prefix, parameters.replay_start_size,
        parameters.update_frequency, rng, s3_utility)

    print 'building experiment...'
    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng)

    print 'running experiment...'
    experiment.run()
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)

    # Optionally dump raw screens to disk.
    if isinstance(parameters.record_screen_dir, str):
        if len(parameters.record_screen_dir):
            ale.setString('record_screen_dir', parameters.record_screen_dir)

    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
            parameters.phi_length, parameters.discount,
            parameters.learning_rate, parameters.rms_decay,
            parameters.rms_epsilon, parameters.momentum,
            parameters.clip_delta, parameters.freeze_interval,
            parameters.batch_size, parameters.network_type,
            parameters.update_rule, parameters.batch_accumulator, rng)
    else:
        with open(parameters.nn_file, 'rb') as handle:
            network = cPickle.load(handle)

    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        parameters.experiment_prefix, parameters.replay_start_size,
        parameters.update_frequency, rng)

    # Get the coach: let it have read/write access to the agent's databanks.
    coach = None
    if parameters.nn_coach_file is not None:
        with open(parameters.nn_coach_file, 'rb') as handle:
            coach_network = cPickle.load(handle)
        coach = ale_coach.NeuralCoach(coach_network,
                                      agent.get_training_dataset(),
                                      parameters.coach_epsilon, rng)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng, coach=coach)

    experiment.run()
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    # Load the VAE auxiliary file.
    with open(parameters.vae_aux_file, 'rb') as f:
        aux_data = pickle.load(f)
    params = aux_data['params']

    with tf.device(None):  # "/gpu:0"
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        VAE = vae.vae(params)
        VAE._create_network_()
        try:
            sess.run(tf.global_variables_initializer())
        except AttributeError:
            sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver()

    import theano
    import ale_experiment
    import ale_agent
    import q_network

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
            parameters.phi_length, parameters.discount,
            parameters.learning_rate, parameters.rms_decay,
            parameters.rms_epsilon, parameters.momentum,
            parameters.clip_delta, parameters.freeze_interval,
            parameters.batch_size, parameters.network_type,
            parameters.update_rule, parameters.batch_accumulator, rng)
    else:
        handle = open(parameters.nn_file, 'rb')
        network = cPickle.load(handle)

    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        parameters.experiment_prefix, parameters.replay_start_size,
        parameters.update_frequency, rng)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng, VAE, sess,
        defaults.VAE_REQ_STEPS, defaults.VAE_STORAGE_SIZE)

    time_str = time.strftime("%m-%d-%H-%M", time.gmtime())
    vae_save_path = '%s/%s_beta%f_z%d' % (defaults.VAE_OUT_PREFIX,
                                          rom.split('.')[0],
                                          params['beta'], params['z_size'])
    os.system('mkdir -p %s' % (vae_save_path))

    experiment.run()

    ckpt_path = '%s/%s_%s' % (vae_save_path, rom.split('.')[0], time_str)
    print ckpt_path
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_path)
    saver.save(sess, '%s/checkpoint.ckpt' % (ckpt_path))
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(parameters.Seed)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd_data = 'deterministic'
        theano.config.dnn.conv.algo_bwd_filter = 'deterministic'

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    agent = None
    if not parameters.close2:
        print 'transition length is ', parameters.transition_length, \
            'transition range is', parameters.transition_range

    if parameters.method == 'ot':
        if parameters.nn_file is None:
            network = q_network.DeepQLearner(
                defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
                parameters.phi_length, parameters.discount,
                parameters.learning_rate, parameters.rms_decay,
                parameters.rms_epsilon, parameters.momentum,
                parameters.clip_delta, parameters.freeze_interval,
                parameters.batch_size, parameters.network_type,
                parameters.update_rule, parameters.batch_accumulator, rng,
                double=parameters.double_dqn,
                transition_length=parameters.transition_length)
        else:
            handle = open(parameters.nn_file, 'rb')
            network = cPickle.load(handle)
        agent = ale_agents.OptimalityTightening(
            network, parameters.epsilon_start, parameters.epsilon_min,
            parameters.epsilon_decay, parameters.replay_memory_size,
            parameters.experiment_prefix, parameters.update_frequency,
            parameters.replay_start_size, rng, parameters.transition_length,
            parameters.transition_range, parameters.penalty_method,
            parameters.weight_min, parameters.weight_max,
            parameters.annealing_len, parameters.beta, parameters.two_train,
            parameters.late2, parameters.close2, parameters.verbose,
            parameters.double_dqn, parameters.save_pkl)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng,
        parameters.flickering_buffer_size)

    experiment.run()
import ale_python_interface
import numpy as np

# Init ALE.
ale = ale_python_interface.ALEInterface()
# ale.setInt('random_seed', 123)  # TODO: find out what this is
ale.setBool('display_screen', True)
ale.setFloat('repeat_action_probability', 0)
full_rom_path = "../roms/breakout.bin"
ale.loadROM(full_rom_path)

# Testing ALE: cycle through the minimal action set, holding each action
# for `repeat_action` steps.
legal_actions = ale.getMinimalActionSet()
print legal_actions

total_reward = 0
repeat_action = 20
a_count = 0
index = 0
while not ale.game_over():
    if a_count > repeat_action:
        index = (index + 1) % len(legal_actions)
        a_count = 0
    a = legal_actions[index]
    print "Action: ", a
    # Apply the action and accumulate the reward; without act() the
    # emulator never advances and the loop never terminates.
    total_reward += ale.act(a)
    a_count += 1

print "Total reward: ", total_reward
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    # Load the VAE auxiliary file.
    with open(parameters.vae_aux_file, 'rb') as f:
        aux_data = pickle.load(f)
    params = aux_data['params']

    with tf.device(None):  # "/gpu:0"
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        # To force CPU-only execution instead:
        # config = tf.ConfigProto(device_count={'GPU': 0})
        sess = tf.Session(config=config)
        VAE = vae.vae(params)
        VAE._create_network_()
        try:
            sess.run(tf.global_variables_initializer())
        except AttributeError:
            sess.run(tf.initialize_all_variables())
        saver = tf.train.Saver()
        chkpt = tf.train.get_checkpoint_state(parameters.vae_file)
        if chkpt and chkpt.model_checkpoint_path:
            saver.restore(sess, chkpt.model_checkpoint_path)
        else:
            print 'No checkpoint found'

    import theano
    import ale_experiment
    import ale_agent
    import q_network

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    if parameters.nn_file is None:
        # The DQN consumes the VAE's latent code, so its input is
        # z_size x 1 rather than a resized screen.
        network = q_network.DeepQLearner(
            VAE.z_size, 1, num_actions, parameters.phi_length,
            parameters.discount, parameters.learning_rate,
            parameters.rms_decay, parameters.rms_epsilon,
            parameters.momentum, parameters.clip_delta,
            parameters.freeze_interval, parameters.batch_size,
            parameters.network_type, parameters.update_rule,
            parameters.batch_accumulator, rng)
    else:
        handle = open(parameters.nn_file, 'rb')
        network = cPickle.load(handle)

    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        parameters.experiment_prefix, parameters.replay_start_size,
        parameters.update_frequency, rng)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, VAE.X_size[1], VAE.X_size[0], parameters.resize_method,
        parameters.epochs, parameters.steps_per_epoch,
        parameters.steps_per_test, parameters.frame_skip,
        parameters.death_ends_episode, parameters.max_start_nullops, rng,
        VAE, sess)

    experiment.run()
def main():
    ale = ale_python_interface.ALEInterface()
    ale.loadROM('aleroms/pong.bin')
    actions = ale.getMinimalActionSet()
    params = shared.bindNew()
    model = make_model(len(actions))

    # Build the actor-critic graph: the last output unit is the value
    # estimate, the rest are softmaxed into the policy.
    x = T.tensor3()
    r = T.scalar()
    a = T.iscalar()
    lr = theano.shared(numpy.float32(0.00025))
    out = model(x.dimshuffle('x', 0, 1, 2))
    v = out[0, -1]
    pol = T.nnet.softmax(out[:, :-1])[0]
    A = r - v
    logpi = T.log(pol[a])
    actor_loss = logpi * theano.gradient.disconnected_grad(A)
    critic_loss = A**2
    loss = T.mean(actor_loss + critic_loss)
    updates = rmsprop(0.999)(params, T.grad(loss, params), lr)
    train = theano.function([x, a, r], [critic_loss, actor_loss],
                            updates=updates)
    get_pol = theano.function([x], pol)
    get_v = theano.function([x], v)
    print 'compiled functions'

    def getImg():
        x = ale.getScreenGrayscale()
        return numpy.float32(
            scipy.misc.imresize(x[:, :, 0], (84, 84)) / 255.)

    ale.reset_game()
    x = [getImg()] * 4
    t_max = 5
    gamma = 0.99
    loss = 0
    t = 0
    tot_r = 0
    rs = []
    pp.ion()
    pp.show()
    for i in range(100000):
        # Collect a partial trajectory of at most t_max steps.
        traj = []
        for j in range(t_max):
            t += 1
            x = x[1:] + [getImg()]
            pol = get_pol(x)
            a = numpy.int32(numpy.argmax(numpy.random.multinomial(1, pol)))
            r = 0
            for _ in range(4):
                r += numpy.float32(ale.act(actions[a]))
            tot_r += r
            traj.append([x, a, r])
            if ale.game_over():
                break
        # Bootstrap the return from the value estimate unless terminal.
        R = 0 if ale.game_over() else get_v(traj[-1][0])
        for x, a, r in traj[:-1][::-1]:
            R = r + gamma * R
            loss += train(x, a, numpy.float32(R))[0]
        if ale.game_over():
            rs.append(tot_r)
            print i, t, loss, tot_r
            print pol
            ale.reset_game()
            x = [getImg()] * 4
            loss = 0
            t = 0
            tot_r = 0
            pp.clf()
            if len(rs) < 200:
                pp.plot(rs)
            else:
                try:
                    plotmeans(numpy.float32(rs))
                except Exception, e:
                    print e
            # pp.show(block=False)
            pp.draw()
            pp.pause(0.001)
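# Self-contained sketch of the n-step return used in main() above: walk the
# collected rewards backwards, bootstrapping from the critic's value
# estimate V(s_T) when the trajectory did not end the episode. The helper
# name is an assumption.
def nstep_returns(rewards, bootstrap_value, gamma=0.99):
    R = bootstrap_value
    out = []
    for r in reversed(rewards):
        R = r + gamma * R
        out.append(R)
    return out[::-1]

# e.g. nstep_returns([0., 0., 1.], 0.5) -> [1.465..., 1.480..., 1.495]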
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    agent = None
    if parameters.method == 'ec_dqn':
        if parameters.nn_file is None:
            network = q_network.DeepQLearner(
                defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
                parameters.phi_length, parameters.discount,
                parameters.learning_rate, parameters.rms_decay,
                parameters.rms_epsilon, parameters.momentum,
                parameters.clip_delta, parameters.freeze_interval,
                parameters.batch_size, parameters.network_type,
                parameters.update_rule, parameters.batch_accumulator, rng,
                use_ec=True, double=parameters.double_dqn)
        else:
            handle = open(parameters.nn_file, 'rb')
            network = cPickle.load(handle)
        if parameters.qec_table is None:
            qec_table = EC_functions.QECTable(
                parameters.knn, parameters.state_dimension,
                parameters.projection_type,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, num_actions, rng,
                parameters.rebuild_knn_frequency)
        else:
            handle = open(parameters.qec_table, 'rb')
            qec_table = cPickle.load(handle)
        agent = ale_agents.EC_DQN(
            network, qec_table, parameters.epsilon_start,
            parameters.epsilon_min, parameters.epsilon_decay,
            parameters.replay_memory_size, parameters.experiment_prefix,
            parameters.replay_start_size, parameters.update_frequency,
            parameters.ec_discount, num_actions, parameters.ec_testing, rng)

    if parameters.method == 'dqn_episodic_memory1':
        if parameters.nn_file is None:
            network = q_network.DeepQLearner(
                defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
                parameters.phi_length, parameters.discount,
                parameters.learning_rate, parameters.rms_decay,
                parameters.rms_epsilon, parameters.momentum,
                parameters.clip_delta, parameters.freeze_interval,
                parameters.batch_size, parameters.network_type,
                parameters.update_rule, parameters.batch_accumulator, rng,
                use_episodic_mem=True, double=parameters.double_dqn)
        else:
            handle = open(parameters.nn_file, 'rb')
            network = cPickle.load(handle)
        if parameters.qec_table is None:
            qec_table = EC_functions.QECTable(
                parameters.knn, parameters.state_dimension,
                parameters.projection_type,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, num_actions, rng,
                parameters.rebuild_knn_frequency)
        else:
            handle = open(parameters.qec_table, 'rb')
            qec_table = cPickle.load(handle)
        agent = ale_agents.NeuralNetworkEpisodicMemory1(
            network, qec_table, parameters.epsilon_start,
            parameters.epsilon_min, parameters.epsilon_decay,
            parameters.replay_memory_size, parameters.experiment_prefix,
            parameters.replay_start_size, parameters.update_frequency,
            parameters.ec_discount, num_actions, parameters.ec_testing, rng)

    if parameters.method == 'dqn_episodic_memory2':
        # NOTE: this branch builds a network and a QEC table but never
        # constructs an agent, so `agent` stays None.
        if parameters.nn_file is None:
            network = q_network.DeepQLearner(
                defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
                parameters.phi_length, parameters.discount,
                parameters.learning_rate, parameters.rms_decay,
                parameters.rms_epsilon, parameters.momentum,
                parameters.clip_delta, parameters.freeze_interval,
                parameters.batch_size, parameters.network_type,
                parameters.update_rule, parameters.batch_accumulator, rng,
                use_episodic_mem=True, double=parameters.double_dqn)
        else:
            handle = open(parameters.nn_file, 'rb')
            network = cPickle.load(handle)
        if parameters.qec_table is None:
            qec_table = EC_functions.QECTable(
                parameters.knn, parameters.state_dimension,
                parameters.projection_type,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, num_actions, rng,
                parameters.rebuild_knn_frequency)
        else:
            handle = open(parameters.qec_table, 'rb')
            qec_table = cPickle.load(handle)

    if parameters.method == 'dqn_episodic_memory3':
        if parameters.nn_file is None:
            network = q_network.DeepQLearner(
                defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
                parameters.phi_length, parameters.discount,
                parameters.learning_rate, parameters.rms_decay,
                parameters.rms_epsilon, parameters.momentum,
                parameters.clip_delta, parameters.freeze_interval,
                parameters.batch_size, parameters.network_type,
                parameters.update_rule, parameters.batch_accumulator, rng,
                use_episodic_mem=True, double=parameters.double_dqn)
        else:
            handle = open(parameters.nn_file, 'rb')
            network = cPickle.load(handle)
        if parameters.qec_table is None:
            qec_table = EC_functions.LshHash(
                parameters.state_dimension,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, rng)
        else:
            handle = open(parameters.qec_table, 'rb')
            qec_table = cPickle.load(handle)
        agent = ale_agents.NeuralNetworkEpisodicMemory3(
            network, qec_table, parameters.epsilon_start,
            parameters.epsilon_min, parameters.epsilon_decay,
            parameters.replay_memory_size, parameters.experiment_prefix,
            parameters.replay_start_size, parameters.update_frequency,
            parameters.ec_discount, num_actions, parameters.ec_testing, rng)

    if parameters.method == 'dqn':
        if parameters.nn_file is None:
            network = q_network.DeepQLearner(
                defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
                parameters.phi_length, parameters.discount,
                parameters.learning_rate, parameters.rms_decay,
                parameters.rms_epsilon, parameters.momentum,
                parameters.clip_delta, parameters.freeze_interval,
                parameters.batch_size, parameters.network_type,
                parameters.update_rule, parameters.batch_accumulator, rng,
                double=parameters.double_dqn)
        else:
            handle = open(parameters.nn_file, 'rb')
            network = cPickle.load(handle)
        agent = ale_agents.NeuralAgent(
            network, parameters.epsilon_start, parameters.epsilon_min,
            parameters.epsilon_decay, parameters.replay_memory_size,
            parameters.experiment_prefix, parameters.replay_start_size,
            parameters.update_frequency, rng)

    if parameters.method == 'episodic_control':
        if parameters.qec_table is None:
            qec_table = EC_functions.QECTable(
                parameters.knn, parameters.state_dimension,
                parameters.projection_type,
                defaults.RESIZED_WIDTH * defaults.RESIZED_HEIGHT,
                parameters.buffer_size, num_actions, rng,
                parameters.rebuild_knn_frequency)
        else:
            handle = open(parameters.qec_table, 'rb')
            qec_table = cPickle.load(handle)
        agent = ale_agents.EpisodicControl(
            qec_table, parameters.ec_discount, num_actions,
            parameters.epsilon_start, parameters.epsilon_min,
            parameters.epsilon_decay, parameters.experiment_prefix,
            parameters.ec_testing, rng)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng)

    experiment.run()
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    time_str = time.strftime("_%m-%d-%H-%M_", time.localtime())
    logging.basicConfig(filename='log' + time_str + '.txt',
                        level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        import sys
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)
    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())

    ######################################################
    # Daniel: This is where I insert human-guided stuff. #
    ######################################################
    # Logic to deal with loading a separate network trained on human data.
    # Must also address mapping from human net (0,1,2,...) to ALE.
    # I know that, for Breakout, my {0,1,2} correspond to {NOOP,LEFT,RIGHT}.
    # But how should these get mapped to ALE actions? I know 0=noop, 1=fire.
    # Keep in mind that there's a SECOND mapping that happens after this!
    map_action_index = None
    human_net = None
    human_experience_replay = None

    if parameters.use_human_net:
        if rom == 'breakout' or rom == 'breakout.bin':
            # This maps the action indices from the net (0,1,2,...) into a
            # **second** mapping [0 1 3 4], which is game-independent, so
            # the main work is to set map_action_index.
            # Thus, 0 ==> 0 ==> 0 (NOOP)
            # Thus, 1 ==> 3 ==> 4 (LEFT)
            # Thus, 2 ==> 2 ==> 3 (RIGHT)
            # (The net doesn't use FIRE.)
            map_action_index = {0: 0, 1: 3, 2: 2}
        elif rom == 'space_invaders' or rom == 'space_invaders.bin':
            # Second mapping is [0 1 3 4 11 12]. E.g., 4 is FLEFT in my
            # data, and needs to be mapped to index 5 so it results in '12'.
            map_action_index = {0: 0, 1: 1, 2: 3, 3: 2, 4: 5, 5: 4}
        else:
            raise ValueError("rom={} doesn't have action mapping".format(rom))

        # Let's make the human net; #actions = len(map_action_index).
        human_net = human_q_net.HumanQNetwork(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
            len(map_action_index), parameters.phi_length,
            parameters.batch_size, parameters.network_type,
            parameters.human_net_path, map_action_index)

    if parameters.use_human_experience_replay:
        if rom == 'breakout' or rom == 'breakout.bin':
            human_experience_replay = np.load(
                parameters.human_experience_replay_path)
        else:
            raise ValueError("rom={} doesn't have xp replay".format(rom))

    ###########################
    # Daniel: Back to normal. #
    ###########################
    if parameters.nn_file is None:
        network = q_network.DeepQLearner(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
            parameters.phi_length, parameters.discount,
            parameters.learning_rate, parameters.rms_decay,
            parameters.rms_epsilon, parameters.momentum,
            parameters.clip_delta, parameters.freeze_interval,
            parameters.batch_size, parameters.network_type,
            parameters.update_rule, parameters.batch_accumulator, rng)
    else:
        handle = open(parameters.nn_file, 'rb')
        network = cPickle.load(handle)

    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        parameters.experiment_prefix, parameters.replay_start_size,
        parameters.update_frequency, rng, parameters.epochs,
        parameters.use_human_net, parameters.use_human_experience_replay,
        human_net, human_experience_replay)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng)

    experiment.run()
def thread_learner_(thread_idx, global_params, t_grads, t_counters, running,
                    rs):
    numpy.random.seed(thread_idx)
    thread_counters = mpasnp(t_counters)
    thread_grads = map(mpasnp, t_grads)

    ale = ale_python_interface.ALEInterface()
    ale.loadROM('aleroms/breakout.bin')
    # ale.setInt('frame_skip', 4)
    ale.setFloat('repeat_action_probability', 0)
    ale.setBool('color_averaging', False)
    actions = ale.getMinimalActionSet()

    params = shared.bindNew()
    model = make_model(len(actions))
    train, get_pol, get_v, get_grads, update_from_grads, lr, beta = make_funcs(
        model, params)
    # Bind this worker's parameters to the shared-memory arrays so all
    # processes read the same weights.
    for gp, p in zip(global_params, params):
        p.set_value(mpasnp(gp), borrow=True)
    print 'compiled functions'

    def getImg():
        x = ale.getScreenGrayscale()
        return numpy.float32(
            scipy.misc.imresize(x[:, :, 0], (84, 84)) / 255.)

    ale.reset_game()
    x = [getImg()] * 4
    t_max = 5
    gamma = 0.99
    loss = 0
    t = 0
    tot_r = 0
    # rs = []
    # pp.ion()
    # pp.show()
    t1000 = time.time()
    frame_0 = ale.getEpisodeFrameNumber()
    for i in range(1000000):
        if running[3][0] < 1:
            break
        traj = []
        for j in range(t_max):
            t += 1
            x = x[1:] + [getImg()]
            pol = get_pol(x)
            a = numpy.int32(numpy.argmax(numpy.random.multinomial(1, pol)))
            r = 0
            for _ in range(4):
                r += numpy.float32(ale.act(actions[a]))
            tot_r += r
            traj.append([x, a, r])
            if ale.game_over():
                break
        R = 0 if ale.game_over() else get_v(traj[-1][0])
        for x, a, r in traj[:-1][::-1]:
            R = r + gamma * R
            # loss += train(x, a, numpy.float32(R))[0]
            gs = get_grads(x, a, numpy.float32(R))
            # print 'pushing grads', thread_idx, thread_counters
            t0 = time.time()
            # Accumulate local gradients into the shared buffers under the
            # lock; the main process applies and zeroes them.
            with t_grads[0][0].get_lock():
                # print ' ', time.time() - t0, thread_idx
                for g, tg in zip(gs, thread_grads):
                    tg += g
                thread_counters[thread_idx] += 1
        if ale.game_over():
            t1001 = time.time()
            print 'FPS:', ale.getEpisodeFrameNumber() / (t1001 - t1000)
            print ' ', ale.getEpisodeFrameNumber()
            print ' ', thread_idx
            t1000 = time.time()
            beta.set_value(numpy.float32(beta.get_value() * 0.99))
            rs.append(tot_r)
            print i, t, loss, tot_r
            print pol, beta.get_value()
            ale.reset_game()
            x = [getImg()] * 4
            loss = 0
            t = 0
            tot_r = 0
            if 0:
                pp.clf()
                if len(rs) < 200:
                    pp.plot(rs)
                else:
                    plotmeans(numpy.float32(rs))
                # pp.show(block=False)
                pp.draw()
                pp.pause(0.001)
def launch(args, defaults, description):
    """
    Execute a complete training run.
    """
    logging.basicConfig(level=logging.INFO)
    parameters = process_args(args, defaults, description)

    if parameters.rom.endswith('.bin'):
        rom = parameters.rom
    else:
        rom = "%s.bin" % parameters.rom
    full_rom_path = os.path.join(defaults.BASE_ROM_PATH, rom)

    try:
        mode = int(parameters.mode)
    except ValueError:
        mode = 1

    if parameters.deterministic:
        rng = np.random.RandomState(123456)
    else:
        rng = np.random.RandomState()

    if parameters.cudnn_deterministic:
        theano.config.dnn.conv.algo_bwd = 'deterministic'

    if parameters.experiment_directory:
        experiment_directory = parameters.experiment_directory
    else:
        time_str = time.strftime("_%Y-%m-%d-%H-%M")
        experiment_directory = parameters.experiment_prefix + time_str \
            + '_mode_' + str(mode)

    ale = ale_python_interface.ALEInterface()
    ale.setInt('random_seed', rng.randint(1000))

    if parameters.display_screen:
        if sys.platform == 'darwin':
            import pygame
            pygame.init()
            ale.setBool('sound', False)  # Sound doesn't work on OSX.
    ale.setBool('display_screen', parameters.display_screen)
    ale.setFloat('repeat_action_probability',
                 parameters.repeat_action_probability)

    if parameters.record_video:
        video_directory = os.path.join(experiment_directory, 'video')
        if not os.path.isdir(video_directory):
            os.makedirs(video_directory)
        ale.setString('record_screen_dir', video_directory)
        if sys.platform != 'darwin':
            ale.setBool('sound', True)
            ale.setString("record_sound_filename",
                          os.path.join(video_directory, "sound.wav"))
            # "We set fragsize to 64 to ensure proper sound sync."
            # (That's what videoRecordingExample.cpp in ALE says; the exact
            # meaning is unclear.)
            ale.setInt("fragsize", 64)

    ale.loadROM(full_rom_path)
    num_actions = len(ale.getMinimalActionSet())
    ale.setMode(mode)

    if parameters.nn_file is None:
        network = q_network.DeepQLearner(
            defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT, num_actions,
            parameters.phi_length, parameters.discount,
            parameters.learning_rate, parameters.rms_decay,
            parameters.rms_epsilon, parameters.momentum,
            parameters.clip_delta, parameters.freeze_interval,
            parameters.use_double, parameters.batch_size,
            parameters.network_type, parameters.update_rule,
            parameters.batch_accumulator, rng)
    else:
        handle = open(parameters.nn_file, 'rb')
        network = cPickle.load(handle)

    agent = ale_agent.NeuralAgent(
        network, parameters.epsilon_start, parameters.epsilon_min,
        parameters.epsilon_decay, parameters.replay_memory_size,
        experiment_directory, parameters.replay_start_size,
        parameters.update_frequency, rng,
        recording=parameters.recording)

    experiment = ale_experiment.ALEExperiment(
        ale, agent, defaults.RESIZED_WIDTH, defaults.RESIZED_HEIGHT,
        parameters.resize_method, parameters.epochs,
        parameters.steps_per_epoch, parameters.steps_per_test,
        parameters.frame_skip, parameters.death_ends_episode,
        parameters.max_start_nullops, rng,
        length_in_episodes=parameters.episodes)

    experiment.run()