def test_memory(insertions, samples, img_shape, misc_len, batch_size, capacity, img_dtype=np.float32):
    print("image shape:", img_shape)
    print("misc vector length:", misc_len)
    print("batchsize:", batch_size)
    print("capacity:", capacity)
    print("image data type:", img_dtype.__name__)

    memory = ReplayMemory(img_shape, misc_len, capacity=capacity, batch_size=batch_size)

    # Build one fake transition to insert over and over.
    if img_dtype != np.float32:
        s = [(np.random.random(img_shape) * 255).astype(img_dtype),
             np.random.random(misc_len).astype(np.float32)]
        s2 = [(np.random.random(img_shape) * 255).astype(img_dtype),
              np.random.random(misc_len).astype(np.float32)]
    else:
        s = [np.random.random(img_shape).astype(img_dtype),
             np.random.random(misc_len).astype(np.float32)]
        s2 = [np.random.random(img_shape).astype(img_dtype),
              np.random.random(misc_len).astype(np.float32)]
    a = 0
    r = 1.0
    terminal = False

    for _ in trange(capacity, leave=False, desc="Prefilling memory"):
        memory.add_transition(s, a, s2, r, terminal)

    start = time()
    for _ in trange(insertions, leave=False, desc="Testing insertion speed"):
        memory.add_transition(s, a, s2, r, terminal)
    inserts_time = time() - start

    start = time()
    for _ in trange(samples, leave=False, desc="Testing sampling speed"):
        sample = memory.get_sample()
    sample_time = time() - start

    print("\t{:0.1f} insertions/s. 1k insertions in: {:0.2f}s".format(
        insertions / inserts_time, inserts_time / insertions * 1000))
    print("\t{:0.1f} samples/s. 1k samples in: {:0.2f}s".format(
        samples / sample_time, sample_time / samples * 1000))
    print()
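# A minimal sketch of how test_memory might be driven as a benchmark script.
# The shapes, capacities, and the uint8-vs-float32 comparison below are
# illustrative assumptions, not settings taken from the original benchmark.
if __name__ == "__main__":
    for dtype in (np.float32, np.uint8):
        test_memory(insertions=10000,
                    samples=10000,
                    img_shape=(4, 84, 84),  # assumed stacked-frame shape
                    misc_len=8,             # assumed misc vector length
                    batch_size=32,
                    capacity=10000,
                    img_dtype=dtype)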
class QEngine:
    def __init__(self, **kwargs):
        self.setup = kwargs
        self._initialize(**kwargs)
        # The game object is not picklable, so keep it out of the saved setup.
        if "game" in kwargs:
            del kwargs["game"]

    def _prepare_for_save(self):
        self.setup["epsilon"] = self.epsilon
        self.setup["steps"] = self.steps
        # TODO why isn't it in init? There was some reason but can't remember it now.
        self.setup["skiprate"] = self.skiprate

    def _initialize(self,
                    game=None,
                    network_args=None,
                    actions=None,
                    name=None,
                    net_type="dqn",  # TODO change to the actual class name?
                    reshaped_x=None,
                    reshaped_y=None,
                    skiprate=3,
                    history_length=4,
                    batchsize=64,
                    update_pattern=(1, 1),
                    replay_memory_size=10000,
                    backprop_start_step=10000,
                    start_epsilon=1.0,
                    end_epsilon=0.1,
                    epsilon_decay_start_step=50000,
                    epsilon_decay_steps=100000,
                    reward_scale=1.0,  # TODO useless?
                    melt_steps=10000,
                    shaping_on=False,
                    count_time=False,
                    one_hot_time=False,
                    count_time_interval=1,
                    count_time_max=2100,
                    use_game_variables=True,
                    rearrange_misc=False,
                    remember_n_actions=4,
                    one_hot_nactions=False,
                    misc_scale=None,  # TODO seems useless
                    results_file=None,
                    params_file=None,
                    config_file=None,
                    no_timeout_terminal=False  # TODO seems useless
                    ):
        if game is not None:
            self.game = game
            self.config_file = None
        elif config_file is not None:
            self.config_file = config_file
            self.game = initialize_doom(self.config_file)
        else:
            raise Exception("No game, no config file. Dunno how to initialize doom.")

        if network_args is None:
            network_args = dict()

        if count_time:
            self.count_time = bool(count_time)
            if self.count_time:
                self.one_hot_time = one_hot_time
                self.count_time_max = int(count_time_max)
                self.count_time_interval = int(count_time_interval)
                if one_hot_time:
                    self.count_time_len = int(self.count_time_max / self.count_time_interval)
                else:
                    self.count_time_len = 1
        else:
            self.count_time_len = 0
            self.count_time = False

        self.name = name
        if reward_scale is not None:
            self.reward_scale = reward_scale
        else:
            self.reward_scale = 1.0
        self.rearrange_misc = rearrange_misc
        self.batchsize = batchsize
        self.history_length = max(history_length, 1)
        self.update_pattern = update_pattern
        self.epsilon = max(min(start_epsilon, 1.0), 0.0)
        self.end_epsilon = min(max(end_epsilon, 0.0), self.epsilon)
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_decay_stride = (self.epsilon - end_epsilon) / epsilon_decay_steps
        self.epsilon_decay_start = epsilon_decay_start_step
        self.skiprate = max(skiprate, 0)
        self.shaping_on = shaping_on
        self.steps = 0
        self.melt_steps = melt_steps
        self.backprop_start_step = max(backprop_start_step, batchsize)
        self.one_hot_nactions = one_hot_nactions
        self.no_timeout_terminal = no_timeout_terminal

        if results_file:
            self.results_file = results_file
        else:
            self.results_file = "results/" + name + ".res"
        if params_file:
            self.params_file = params_file
        else:
            self.params_file = "params/" + name

        if self.game.get_available_game_variables_size() > 0 and use_game_variables:
            self.use_game_variables = True
        else:
            self.use_game_variables = False

        self.last_shaping_reward = 0
        self.learning_mode = True

        if actions is None:
            self.actions = generate_default_actions(self.game)
        else:
            self.actions = actions
        self.actions_num = len(self.actions)
        self.actions_stats = np.zeros([self.actions_num], np.int)

        # Changes img_shape according to the history size.
        self.channels = self.game.get_screen_channels()
        if self.history_length > 1:
            self.channels *= self.history_length

        if reshaped_x is None:
            x = self.game.get_screen_width()
            y = self.game.get_screen_height()
            scale_x = scale_y = 1.0
        else:
            x = reshaped_x
            scale_x = float(x) / self.game.get_screen_width()
            if reshaped_y is None:
                y = int(self.game.get_screen_height() * scale_x)
                scale_y = scale_x
            else:
                y = reshaped_y
                scale_y = float(y) / self.game.get_screen_height()

        img_shape = [self.channels, y, x]

        # TODO check if it is slow (it seems it is not)
        if scale_x == 1 and scale_y == 1:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                return img
        else:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype)
                for i in xrange(img.shape[0]):
                    # new_image[i] = skimage.transform.resize(img[i], (y, x), preserve_range=True)
                    new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA)
                return new_image
        self.convert_image = convert

        if self.use_game_variables:
            single_state_misc_len = int(self.game.get_available_game_variables_size() + self.count_time_len)
        else:
            single_state_misc_len = int(self.count_time_len)
        self.single_state_misc_len = single_state_misc_len

        self.remember_n_actions = remember_n_actions
        total_misc_len = int(single_state_misc_len * self.history_length)

        if remember_n_actions > 0:
            self.remember_n_actions = remember_n_actions
            if self.one_hot_nactions:
                self.action_len = int(2 ** floor(log(len(self.actions), 2)))
            else:
                self.action_len = len(self.actions[0])
            self.last_action = np.zeros([self.action_len], dtype=np.float32)
            self.last_n_actions = np.zeros([remember_n_actions * self.action_len], dtype=np.float32)
            total_misc_len += len(self.last_n_actions)

        if total_misc_len > 0:
            self.misc_state_included = True
            self.current_misc_state = np.zeros(total_misc_len, dtype=np.float32)
            if single_state_misc_len > 0:
                if misc_scale is not None:
                    self.misc_scale = np.array(misc_scale, dtype=np.float32)
                else:
                    self.misc_scale = None
        else:
            self.misc_state_included = False

        state_format = dict()
        state_format["s_img"] = img_shape
        state_format["s_misc"] = total_misc_len
        self.replay_memory = ReplayMemory(state_format, replay_memory_size, batchsize)
        network_args["state_format"] = state_format
        network_args["actions_number"] = len(self.actions)

        if net_type in ("dqn", None, ""):
            self.approximator = approximators.DQN(**network_args)
        elif net_type in ["duelling", "dueling"]:
            self.approximator = approximators.DuelingDQN(**network_args)
        else:
            if locate('approximators.' + net_type) is not None:
                self.approximator = locate('approximators.' + net_type)(**network_args)
            else:
                raise Exception("Unsupported approximator type.")

        self.current_image_state = np.zeros(img_shape, dtype=np.float32)

    def _update_state(self):
        raw_state = self.game.get_state()
        img = self.convert_image(raw_state.image_buffer)
        state_misc = None

        if self.single_state_misc_len > 0:
            state_misc = np.zeros(self.single_state_misc_len, dtype=np.float32)
            if self.use_game_variables:
                game_variables = raw_state.game_variables.astype(np.float32)
                state_misc[0:len(game_variables)] = game_variables
                count_time_start = len(game_variables)
            else:
                count_time_start = 0

            if self.count_time:
                raw_time = raw_state.number
                processed_time = int(min(self.count_time_max, raw_time) / self.count_time_interval)
                if self.one_hot_time:
                    num_one_hot = processed_time - 1
                    state_number = np.zeros([self.count_time_len], dtype=np.float32)
                    state_number[num_one_hot] = 1
                    '''
                    # TODO make it available in options
                    # HACK1 that uses health and count as one hot at once
                    hp = int(raw_state.game_variables[0])
                    state = raw_time
                    state_number = np.zeros([self.count_time_len], dtype=np.float32)
                    state_number[hp - 1] = 1
                    state_number[99 + state] = 1
                    # HACK1 ends
                    '''
                    '''
                    # TODO make it available in options
                    # HACK2 that uses health as one hot
                    hp = int(raw_state.game_variables[0])
                    state_number = np.zeros([self.count_time_len], dtype=np.float32)
                    state_number[hp - 1] = 1
                    # HACK2 ends
                    '''
                else:
                    state_number = processed_time
                state_misc[count_time_start:] = state_number

            if self.misc_scale is not None:
                state_misc = state_misc * self.misc_scale

        if self.history_length > 1:
            pure_channels = self.channels / self.history_length
            self.current_image_state[0:-pure_channels] = self.current_image_state[pure_channels:]
            self.current_image_state[-pure_channels:] = img

            if self.single_state_misc_len > 0:
                misc_len = len(state_misc)
                hist_len = self.history_length
                # TODO don't move count_time when it's one hot - it's useless and performance drops slightly
                if self.rearrange_misc:
                    for i in xrange(misc_len):
                        cms_part = self.current_misc_state[i * hist_len:(i + 1) * hist_len]
                        cms_part[0:hist_len - 1] = cms_part[1:]
                        cms_part[-1] = state_misc[i]
                else:
                    cms = self.current_misc_state
                    cms[0:(hist_len - 1) * misc_len] = cms[misc_len:hist_len * misc_len]
                    cms[(hist_len - 1) * misc_len:hist_len * misc_len] = state_misc
        else:
            self.current_image_state[:] = img
            if self.single_state_misc_len > 0:
                self.current_misc_state[0:len(state_misc)] = state_misc

        if self.remember_n_actions:
            self.last_n_actions[:-self.action_len] = self.last_n_actions[self.action_len:]
            self.last_n_actions[-self.action_len:] = self.last_action
            self.current_misc_state[-len(self.last_n_actions):] = self.last_n_actions

    def new_episode(self, update_state=False):
        self.game.new_episode()
        self.reset_state()
        self.last_shaping_reward = 0
        if update_state:
            self._update_state()

    def set_last_action(self, index):
        if self.one_hot_nactions:
            self.last_action.fill(0)
            self.last_action[index] = 1
        else:
            self.last_action[:] = self.actions[index]

    # Returns the current state including history.
    def _current_state(self):
        if self.misc_state_included:
            s = [self.current_image_state, self.current_misc_state]
        else:
            s = [self.current_image_state]
        return s

    # Returns the current state's COPY including history.
    def _current_state_copy(self):
        if self.misc_state_included:
            s = [self.current_image_state.copy(), self.current_misc_state.copy()]
        else:
            s = [self.current_image_state.copy()]
        return s

    # Sets the whole state to zeros.
    def reset_state(self):
        self.current_image_state.fill(0.0)
        if self.misc_state_included:
            self.current_misc_state.fill(0.0)
            if self.remember_n_actions > 0:
                self.set_last_action(0)
                self.last_n_actions.fill(0)

    def make_step(self):
        self._update_state()
        # TODO check if not making the copy still works
        a = self.approximator.estimate_best_action(self._current_state_copy())
        self.actions_stats[a] += 1
        self.game.make_action(self.actions[a], self.skiprate + 1)
        if self.remember_n_actions:
            self.set_last_action(a)

    def make_sleep_step(self, sleep_time=1 / 35.0):
        self._update_state()
        a = self.approximator.estimate_best_action(self._current_state_copy())
        self.actions_stats[a] += 1
        self.game.set_action(self.actions[a])
        if self.remember_n_actions:
            self.set_last_action(a)
        for i in xrange(self.skiprate):
            self.game.advance_action(1, False, True)
            sleep(sleep_time)
        self.game.advance_action()
        sleep(sleep_time)

    def check_timeout(self):
        return (self.game.get_episode_time() - self.game.get_episode_start_time()
                >= self.game.get_episode_timeout())

    # Performs a learning step according to the epsilon-greedy policy.
    # The step spans self.skiprate + 1 actions.
    def make_learning_step(self):
        self.steps += 1
        # Epsilon decay:
        if self.steps > self.epsilon_decay_start and self.epsilon > self.end_epsilon:
            self.epsilon = max(self.epsilon - self.epsilon_decay_stride, 0)

        # Copy because the state will be changed in a second.
        s = self._current_state_copy()

        # With probability epsilon choose a random action:
        if self.epsilon >= random.random():
            a = random.randint(0, len(self.actions) - 1)
        else:
            a = self.approximator.estimate_best_action(s)
        self.actions_stats[a] += 1

        # Make the action and get the reward.
        if self.remember_n_actions:
            self.set_last_action(a)
        r = self.game.make_action(self.actions[a], self.skiprate + 1)
        r = np.float32(r)
        if self.shaping_on:
            sr = np.float32(doom_fixed_to_double(self.game.get_game_variable(GameVariable.USER1)))
            r += sr - self.last_shaping_reward
            self.last_shaping_reward = sr
        r *= self.reward_scale

        # Update state s2 accordingly and add the transition.
        if self.game.is_episode_finished():
            # Timeouts are not stored as terminal states when no_timeout_terminal is set.
            if (not self.no_timeout_terminal) or (not self.check_timeout()):
                s2 = None
                self.replay_memory.add_transition(s, a, s2, r, terminal=True)
        else:
            self._update_state()
            s2 = self._current_state()
            self.replay_memory.add_transition(s, a, s2, r, terminal=False)

        # Perform Q-learning once in a while.
        if self.replay_memory.size >= self.backprop_start_step and self.steps % self.update_pattern[0] == 0:
            for _ in xrange(self.update_pattern[1]):
                self.approximator.learn(self.replay_memory.get_sample())

        # Melt the network sometimes.
        if self.steps % self.melt_steps == 0:
            self.approximator.melt()

    # Runs a single episode in the current mode.
    # It ignores the mode if learn==true/false.
    def run_episode(self, sleep_time=0):
        self.new_episode()
        if sleep_time == 0:
            while not self.game.is_episode_finished():
                self.make_step()
        else:
            while not self.game.is_episode_finished():
                self.make_sleep_step(sleep_time)
        return np.float32(self.game.get_total_reward())

    # Utility stuff
    def get_actions_stats(self, clear=False, norm=True):
        stats = self.actions_stats.copy()
        if norm:
            stats = stats / np.float32(self.actions_stats.sum())
            stats[stats == 0.0] = -1
            stats = np.around(stats, 3)
        if clear:
            self.actions_stats.fill(0)
        return stats

    def get_steps(self):
        return self.steps

    def get_epsilon(self):
        return self.epsilon

    def get_network(self):
        return self.approximator.network

    def set_epsilon(self, eps):
        self.epsilon = eps

    def set_skiprate(self, skiprate):
        self.skiprate = max(skiprate, 0)

    def get_skiprate(self):
        return self.skiprate

    def get_mean_loss(self):
        return self.approximator.get_mean_loss()

    # Saves network weights to a file.
    def save_params(self, filename, quiet=False):
        if not quiet:
            print "Saving network weights to " + filename + "..."
        self._prepare_for_save()
        params = get_all_param_values(self.approximator.network)
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."

    # Loads network weights from a file.
    def load_params(self, filename, quiet=False):
        if not quiet:
            print "Loading network weights from " + filename + "..."
        params = pickle.load(open(filename, "rb"))
        set_all_param_values(self.approximator.network, params)
        set_all_param_values(self.approximator.frozen_network, params)
        if not quiet:
            print "Loading finished."

    def get_network_architecture(self):
        return get_all_param_values(self.get_network())

    def print_setup(self):
        print "\nNetwork architecture:"
        for p in self.get_network_architecture():
            print p.shape
        print "\n*** Engine setup ***"
        for k in self.setup.keys():
            if k == "network_args":
                print "network_args:"
                net_args = self.setup[k]
                for k2 in net_args.keys():
                    print "\t", k2, ":", net_args[k2]
            else:
                print k, ":", self.setup[k]

    # Loads the whole engine with params from a file.
    @staticmethod
    def load(filename, game=None, config_file=None, quiet=False):
        if not quiet:
            print "Loading qengine from " + filename + "..."
        params = pickle.load(open(filename, "rb"))
        qengine_args = params[0]
        network_weights = params[1]
        steps = qengine_args["steps"]
        epsilon = qengine_args["epsilon"]
        del qengine_args["epsilon"]
        del qengine_args["steps"]

        if game is None:
            if config_file is not None:
                game = initialize_doom(config_file)
                qengine_args["config_file"] = config_file
            elif "config_file" in qengine_args and qengine_args["config_file"] is not None:
                game = initialize_doom(qengine_args["config_file"])
            else:
                raise Exception("No game, no config file. Dunno how to initialize doom.")
        else:
            qengine_args["config_file"] = None
        qengine_args["game"] = game

        qengine = QEngine(**qengine_args)
        set_all_param_values(qengine.approximator.network, network_weights)
        set_all_param_values(qengine.approximator.frozen_network, network_weights)
        if not quiet:
            print "Loading finished."
        qengine.steps = steps
        qengine.epsilon = epsilon
        return qengine

    # Saves the whole engine with params to a file.
    def save(self, filename=None, quiet=False):
        if filename is None:
            filename = self.params_file
        if not quiet:
            print "Saving qengine to " + filename + "..."
        self._prepare_for_save()
        network_params = get_all_param_values(self.approximator.network)
        params = [self.setup, network_params]
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."
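# A hedged usage sketch for QEngine: run learning episodes for a few epochs,
# then save the engine. The config path, name, and epoch/episode counts are
# illustrative assumptions; the real experiment scripts may drive it differently.
if __name__ == "__main__":
    engine = QEngine(config_file="config/basic.cfg",  # hypothetical config path
                     name="basic_dqn",
                     net_type="dqn",
                     replay_memory_size=10000,
                     backprop_start_step=10000)
    for epoch in xrange(20):        # assumed epoch count
        for episode in xrange(100):  # assumed episodes per epoch
            engine.new_episode(update_state=True)
            while not engine.game.is_episode_finished():
                engine.make_learning_step()
        print "epoch", epoch, "epsilon:", engine.get_epsilon()
        engine.save()  # defaults to params/<name>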
class DQN(object):
    def __init__(self,
                 scenario_tag=None,
                 run_id_string=None,
                 network_type="networks.DQNNet",
                 write_summaries=True,
                 tf_logdir="tensorboard_logs",
                 epochs=100,
                 train_steps_per_epoch=1000000,
                 test_episodes_per_epoch=100,
                 run_tests=True,
                 initial_epsilon=1.0,
                 final_epsilon=0.0,
                 epsilon_decay_steps=10e07,
                 epsilon_decay_start_step=2e05,
                 frozen_steps=5000,
                 batchsize=32,
                 memory_capacity=10000,
                 update_pattern=(4, 4),
                 prioritized_memory=False,
                 enable_progress_bar=True,
                 save_interval=1,
                 writer_max_queue=10,
                 writer_flush_secs=120,
                 dynamic_frameskips=None,
                 **settings):
        if prioritized_memory:
            # TODO maybe some day ...
            raise NotImplementedError("Prioritized memory not implemented. Maybe some day.")

        if dynamic_frameskips:
            if isinstance(dynamic_frameskips, (list, tuple)):
                self.frameskips = list(dynamic_frameskips)
            elif isinstance(dynamic_frameskips, int):
                self.frameskips = list(range(1, dynamic_frameskips + 1))
        else:
            self.frameskips = [None]

        self.update_pattern = update_pattern
        self.write_summaries = write_summaries
        self._settings = settings
        self.run_id_string = run_id_string
        self.train_steps_per_epoch = train_steps_per_epoch
        self._run_tests = test_episodes_per_epoch > 0 and run_tests
        self.test_episodes_per_epoch = test_episodes_per_epoch
        self._epochs = np.float32(epochs)

        self.doom_wrapper = VizdoomWrapper(**settings)
        misc_len = self.doom_wrapper.misc_len
        img_shape = self.doom_wrapper.img_shape
        self.use_misc = self.doom_wrapper.use_misc
        self.actions_num = self.doom_wrapper.actions_num
        self.replay_memory = ReplayMemory(img_shape, misc_len, batch_size=batchsize, capacity=memory_capacity)
        self.network = eval(network_type)(actions_num=self.actions_num * len(self.frameskips),
                                          img_shape=img_shape,
                                          misc_len=misc_len,
                                          **settings)
        self.batchsize = batchsize
        self.frozen_steps = frozen_steps
        self.save_interval = save_interval
        self._model_savefile = settings["models_path"] + "/" + self.run_id_string

        # TODO move summaries somewhere so they are consistent between dqn and asyncs
        if self.write_summaries:
            assert tf_logdir is not None
            if not os.path.isdir(tf_logdir):
                os.makedirs(tf_logdir)
            self.scores_placeholder, summaries = setup_vector_summaries(scenario_tag + "/scores")
            self._summaries = tf.summary.merge(summaries)
            self._train_writer = tf.summary.FileWriter("{}/{}/{}".format(tf_logdir, self.run_id_string, "train"),
                                                       flush_secs=writer_flush_secs, max_queue=writer_max_queue)
            self._test_writer = tf.summary.FileWriter("{}/{}/{}".format(tf_logdir, self.run_id_string, "test"),
                                                      flush_secs=writer_flush_secs, max_queue=writer_max_queue)
        else:
            self._train_writer = None
            self._test_writer = None
            self._summaries = None

        self.steps = 0
        # TODO epoch as tf variable?
        self._epoch = 1

        # Epsilon
        self.epsilon_decay_rate = (initial_epsilon - final_epsilon) / epsilon_decay_steps
        self.epsilon_decay_start_step = epsilon_decay_start_step
        self.initial_epsilon = initial_epsilon
        self.final_epsilon = final_epsilon

        self.enable_progress_bar = enable_progress_bar

    def get_current_epsilon(self):
        eps = self.initial_epsilon - (self.steps - self.epsilon_decay_start_step) * self.epsilon_decay_rate
        return np.clip(eps, self.final_epsilon, 1.0)

    def get_action_and_frameskip(self, ai):
        # The network's output indexes (action, frameskip) pairs.
        action = ai % self.actions_num
        frameskip = self.frameskips[ai // self.actions_num]
        return action, frameskip

    @staticmethod
    def print_epoch_log(prefix, scores, steps, epoch_time):
        mean_score = np.mean(scores)
        score_std = np.std(scores)
        min_score = np.min(scores)
        max_score = np.max(scores)
        episodes = len(scores)

        steps_per_sec = steps / epoch_time
        mil_steps_per_hour = steps_per_sec * 3600 / 1000000.0
        log(
            "{}: Episodes: {}, mean: {}, min: {}, max: {}, "
            " Speed: {:.0f} STEPS/s, {:.2f}M STEPS/hour, time: {}".format(
                prefix,
                episodes,
                green("{:0.3f}±{:0.2f}".format(mean_score, score_std)),
                red("{:0.3f}".format(min_score)),
                blue("{:0.3f}".format(max_score)),
                steps_per_sec,
                mil_steps_per_hour,
                sec_to_str(epoch_time)
            ))

    def save_model(self, session, savefile=None):
        if savefile is None:
            savefile = self._model_savefile
        savedir = os.path.dirname(savefile)
        if not os.path.exists(savedir):
            log("Creating directory: {}".format(savedir))
            os.makedirs(savedir)
        log("Saving model to: {}".format(savefile))
        saver = tf.train.Saver()
        saver.save(session, savefile)

    def load_model(self, session, savefile):
        saver = tf.train.Saver()
        log("Loading model from: {}".format(savefile))
        saver.restore(session, savefile)
        log("Loaded model.")

    def train(self, session):
        # Prefill the replay memory with random-policy transitions:
        for _ in trange(self.replay_memory.capacity, desc="Filling replay memory", leave=False,
                        disable=not self.enable_progress_bar, file=sys.stdout):
            if self.doom_wrapper.is_terminal():
                self.doom_wrapper.reset()
            s1 = self.doom_wrapper.get_current_state()
            action_frameskip_index = randint(0, self.actions_num * len(self.frameskips) - 1)
            action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
            reward = self.doom_wrapper.make_action(action_index, frameskip)
            terminal = self.doom_wrapper.is_terminal()
            s2 = self.doom_wrapper.get_current_state()
            self.replay_memory.add_transition(s1, action_frameskip_index, s2, reward, terminal)

        overall_start_time = time()
        self.network.update_target_network(session)
        log(green("Starting training.\n"))

        while self._epoch <= self._epochs:
            self.doom_wrapper.reset()
            train_scores = []
            test_scores = []
            train_start_time = time()

            for _ in trange(self.train_steps_per_epoch, desc="Training, epoch {}".format(self._epoch),
                            leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
                self.steps += 1
                s1 = self.doom_wrapper.get_current_state()
                # Epsilon-greedy action selection:
                if random() <= self.get_current_epsilon():
                    action_frameskip_index = randint(0, self.actions_num * len(self.frameskips) - 1)
                else:
                    action_frameskip_index = self.network.get_action(session, s1)
                action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
                reward = self.doom_wrapper.make_action(action_index, frameskip)
                terminal = self.doom_wrapper.is_terminal()
                s2 = self.doom_wrapper.get_current_state()
                self.replay_memory.add_transition(s1, action_frameskip_index, s2, reward, terminal)

                if self.steps % self.update_pattern[0] == 0:
                    for _ in range(self.update_pattern[1]):
                        self.network.train_batch(session, self.replay_memory.get_sample())

                if terminal:
                    train_scores.append(self.doom_wrapper.get_total_reward())
                    self.doom_wrapper.reset()

                if self.steps % self.frozen_steps == 0:
                    self.network.update_target_network(session)

            train_time = time() - train_start_time

            log("Epoch {}".format(self._epoch))
            log("Training steps: {}, epsilon: {}".format(self.steps, self.get_current_epsilon()))
            self.print_epoch_log("TRAIN", train_scores, self.train_steps_per_epoch, train_time)

            # TESTING
            test_start_time = time()
            test_steps = 0
            for _ in trange(self.test_episodes_per_epoch, desc="Testing, epoch {}".format(self._epoch),
                            leave=False, disable=not self.enable_progress_bar, file=sys.stdout):
                self.doom_wrapper.reset()
                while not self.doom_wrapper.is_terminal():
                    test_steps += 1
                    state = self.doom_wrapper.get_current_state()
                    action_frameskip_index = self.network.get_action(session, state)
                    action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
                    self.doom_wrapper.make_action(action_index, frameskip)
                test_scores.append(self.doom_wrapper.get_total_reward())

            test_time = time() - test_start_time
            self.print_epoch_log("TEST", test_scores, test_steps, test_time)

            if self.write_summaries:
                log("Writing summaries.")
                train_summary = session.run(self._summaries, {self.scores_placeholder: train_scores})
                self._train_writer.add_summary(train_summary, self.steps)
                if self._run_tests:
                    test_summary = session.run(self._summaries, {self.scores_placeholder: test_scores})
                    self._test_writer.add_summary(test_summary, self.steps)

            # Save the model:
            if self._epoch % self.save_interval == 0:
                self.save_model(session)

            overall_time = time() - overall_start_time
            log("Total elapsed time: {}\n".format(sec_to_str(overall_time)))

            self._epoch += 1

    def run_test_episode(self, session):
        self.doom_wrapper.reset()
        while not self.doom_wrapper.is_terminal():
            state = self.doom_wrapper.get_current_state()
            action_frameskip_index = self.network.get_action(session, state)
            action_index, frameskip = self.get_action_and_frameskip(action_frameskip_index)
            self.doom_wrapper.make_action(action_index, frameskip)
        reward = self.doom_wrapper.get_total_reward()
        return reward
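# A minimal sketch of driving the DQN trainer with a TF1-style session.
# The settings keys shown (models_path, config_file) are assumptions about
# what VizdoomWrapper and the constructor consume; adjust to the real config.
if __name__ == "__main__":
    settings = {
        "models_path": "models",            # used to build _model_savefile
        "config_file": "config/basic.cfg",  # hypothetical VizdoomWrapper option
    }
    dqn = DQN(scenario_tag="basic",
              run_id_string="basic/run_0",
              epochs=20,
              train_steps_per_epoch=5000,
              test_episodes_per_epoch=10,
              **settings)
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        dqn.train(session)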
class QEngine:
    def __init__(self, **kwargs):
        self.setup = kwargs
        self._initialize(**kwargs)
        # The game object is not picklable, so keep it out of the saved setup.
        del kwargs["game"]

    def _prepare_for_save(self):
        self.setup["epsilon"] = self._epsilon
        self.setup["steps"] = self._steps
        # TODO why the f**k isn't it in init?
        self.setup["skiprate"] = self._skiprate

    def _initialize(self,
                    game,
                    network_args=None,
                    actions=None,
                    history_length=4,
                    batchsize=64,
                    update_pattern=(1, 1),
                    replay_memory_size=10000,
                    backprop_start_step=10000,
                    start_epsilon=1.0,
                    end_epsilon=0.1,
                    epsilon_decay_start_step=50000,
                    epsilon_decay_steps=100000,
                    reward_scale=1.0,
                    use_game_variables=True,
                    misc_scale=None,
                    reshaped_x=None,
                    reshaped_y=None,
                    skiprate=4,
                    shaping_on=False,
                    count_states=False,
                    name=None,
                    net_type="cnn",
                    melt_steps=10000,
                    remember_n_actions=0):
        if network_args is None:
            network_args = dict()

        if count_states is not None:
            self._count_states = bool(count_states)

        self.name = name
        self._reward_scale = reward_scale
        self._game = game
        self._batchsize = batchsize
        self._history_length = max(history_length, 1)
        self._update_pattern = update_pattern
        self._epsilon = max(min(start_epsilon, 1.0), 0.0)
        self._end_epsilon = min(max(end_epsilon, 0.0), self._epsilon)
        self._epsilon_decay_steps = epsilon_decay_steps
        self._epsilon_decay_stride = (self._epsilon - end_epsilon) / epsilon_decay_steps
        self._epsilon_decay_start = epsilon_decay_start_step
        self._skiprate = max(skiprate, 0)
        self._shaping_on = shaping_on
        self._steps = 0
        self._melt_steps = melt_steps
        self._backprop_start_step = max(backprop_start_step, batchsize)
        self._use_game_variables = use_game_variables
        self._last_action_index = 0

        if self._shaping_on:
            self._last_shaping_reward = 0

        self.learning_mode = True

        if actions is None:
            self._actions = generate_default_actions(game)
        else:
            self._actions = actions
        self._actions_num = len(self._actions)
        self._actions_stats = np.zeros([self._actions_num], np.int)

        # Changes img_shape according to the history size.
        self._channels = game.get_screen_channels()
        if self._history_length > 1:
            self._channels *= self._history_length

        if reshaped_x is None:
            x = game.get_screen_width()
            y = game.get_screen_height()
            scale_x = scale_y = 1.0
        else:
            x = reshaped_x
            scale_x = float(x) / game.get_screen_width()
            if reshaped_y is None:
                y = int(game.get_screen_height() * scale_x)
                scale_y = scale_x
            else:
                y = reshaped_y
                scale_y = float(y) / game.get_screen_height()

        img_shape = [self._channels, y, x]

        # TODO check if it is slow (it seems it is not)
        if scale_x == 1 and scale_y == 1:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                return img
        else:
            def convert(img):
                img = img.astype(np.float32) / 255.0
                new_image = np.ndarray([img.shape[0], y, x], dtype=img.dtype)
                for i in xrange(img.shape[0]):
                    # new_image[i] = skimage.transform.resize(img[i], (y, x), preserve_range=True)
                    new_image[i] = cv2.resize(img[i], (x, y), interpolation=cv2.INTER_AREA)
                return new_image
        self._convert_image = convert

        if self._use_game_variables:
            single_state_misc_len = game.get_available_game_variables_size() + int(self._count_states)
        else:
            single_state_misc_len = int(self._count_states)
        self._single_state_misc_len = single_state_misc_len

        self._remember_n_actions = remember_n_actions
        if remember_n_actions > 0:
            self._remember_n_actions = remember_n_actions
            self._action_len = len(self._actions[0])
            self._last_n_actions = np.zeros([remember_n_actions * self._action_len], dtype=np.float32)
            self._total_misc_len = single_state_misc_len * self._history_length + len(self._last_n_actions)
            self._last_action_index = 0
        else:
            self._total_misc_len = single_state_misc_len * self._history_length

        if self._total_misc_len > 0:
            self._misc_state_included = True
            self._current_misc_state = np.zeros(self._total_misc_len, dtype=np.float32)
            if single_state_misc_len > 0:
                self._state_misc_buffer = np.zeros(single_state_misc_len, dtype=np.float32)
                if misc_scale is not None:
                    self._misc_scale = np.array(misc_scale, dtype=np.float32)
                else:
                    self._misc_scale = None
        else:
            self._misc_state_included = False

        state_format = dict()
        state_format["s_img"] = img_shape
        state_format["s_misc"] = self._total_misc_len
        self._transitions = ReplayMemory(state_format, replay_memory_size, batchsize)
        network_args["state_format"] = state_format
        network_args["actions_number"] = len(self._actions)

        if net_type in ("dqn", None, ""):
            self._evaluator = DQN(**network_args)
        elif net_type == "duelling":
            self._evaluator = DuellingDQN(**network_args)
        else:
            print "Unsupported evaluator type."
            exit(1)  # TODO throw...?

        self._current_image_state = np.zeros(img_shape, dtype=np.float32)

    def _update_state(self):
        raw_state = self._game.get_state()
        img = self._convert_image(raw_state.image_buffer)
        state_misc = None

        if self._single_state_misc_len > 0:
            state_misc = self._state_misc_buffer
            if self._use_game_variables:
                game_variables = raw_state.game_variables.astype(np.float32)
                state_misc[0:len(game_variables)] = game_variables
            if self._count_states:
                state_misc[-1] = raw_state.number
            if self._misc_scale is not None:
                state_misc = state_misc * self._misc_scale

        if self._history_length > 1:
            pure_channels = self._channels / self._history_length
            self._current_image_state[0:-pure_channels] = self._current_image_state[pure_channels:]
            self._current_image_state[-pure_channels:] = img

            if self._single_state_misc_len > 0:
                misc_len = len(state_misc)
                hist = self._history_length
                self._current_misc_state[0:(hist - 1) * misc_len] = self._current_misc_state[misc_len:hist * misc_len]
                self._current_misc_state[(hist - 1) * misc_len:hist * misc_len] = state_misc
        else:
            self._current_image_state[:] = img
            if self._single_state_misc_len > 0:
                self._current_misc_state[0:len(state_misc)] = state_misc

        if self._remember_n_actions:
            self._last_n_actions[:-self._action_len] = self._last_n_actions[self._action_len:]
            self._last_n_actions[-self._action_len:] = self._actions[self._last_action_index]
            self._current_misc_state[-len(self._last_n_actions):] = self._last_n_actions

    def new_episode(self, update_state=False):
        self._game.new_episode()
        self.reset_state()
        self._last_shaping_reward = 0
        if update_state:
            self._update_state()

    # Returns the current state including history.
    def _current_state(self):
        if self._misc_state_included:
            s = [self._current_image_state, self._current_misc_state]
        else:
            s = [self._current_image_state]
        return s

    # Returns the current state's COPY including history.
    def _current_state_copy(self):
        if self._misc_state_included:
            s = [self._current_image_state.copy(), self._current_misc_state.copy()]
        else:
            s = [self._current_image_state.copy()]
        return s

    # Sets the whole state to zeros.
    def reset_state(self):
        self._current_image_state.fill(0.0)
        self._last_action_index = 0
        if self._misc_state_included:
            self._current_misc_state.fill(0.0)
            if self._remember_n_actions > 0:
                self._last_n_actions.fill(0)

    def make_step(self):
        self._update_state()
        # TODO check if not making the copy still works
        a = self._evaluator.estimate_best_action(self._current_state_copy())
        self._actions_stats[a] += 1
        self._game.make_action(self._actions[a], self._skiprate + 1)
        self._last_action_index = a

    def make_sleep_step(self, sleep_time=1 / 35.0):
        self._update_state()
        a = self._evaluator.estimate_best_action(self._current_state_copy())
        self._actions_stats[a] += 1
        self._game.set_action(self._actions[a])
        self._last_action_index = a
        for i in xrange(self._skiprate):
            self._game.advance_action(1, False, True)
            sleep(sleep_time)
        self._game.advance_action()
        sleep(sleep_time)

    # Performs a learning step according to the epsilon-greedy policy.
    # The step spans self._skiprate + 1 actions.
    def make_learning_step(self):
        self._steps += 1
        # Epsilon decay:
        if self._steps > self._epsilon_decay_start and self._epsilon > self._end_epsilon:
            self._epsilon = max(self._epsilon - self._epsilon_decay_stride, 0)

        # Copy because the state will be changed in a second.
        s = self._current_state_copy()

        # With probability epsilon choose a random action:
        if self._epsilon >= random.random():
            a = random.randint(0, len(self._actions) - 1)
        else:
            a = self._evaluator.estimate_best_action(s)
        self._actions_stats[a] += 1

        # Make the action and get the reward.
        self._last_action_index = a
        r = self._game.make_action(self._actions[a], self._skiprate + 1)
        r = np.float32(r)
        if self._shaping_on:
            sr = np.float32(doom_fixed_to_double(self._game.get_game_variable(GameVariable.USER1)))
            r += sr - self._last_shaping_reward
            self._last_shaping_reward = sr
        r *= self._reward_scale

        # Update state s2 accordingly.
        if self._game.is_episode_finished():
            # Terminal state.
            s2 = None
            self._transitions.add_transition(s, a, s2, r, terminal=True)
        else:
            self._update_state()
            s2 = self._current_state()
            self._transitions.add_transition(s, a, s2, r, terminal=False)

        # Perform Q-learning once in a while.
        if self._transitions.size >= self._backprop_start_step and self._steps % self._update_pattern[0] == 0:
            for _ in xrange(self._update_pattern[1]):
                self._evaluator.learn(self._transitions.get_sample())

        # Melt the network sometimes.
        if self._steps % self._melt_steps == 0:
            self._evaluator.melt()

    # Adds a transition to the bank.
    def add_transition(self, s, a, s2, r, terminal):
        self._transitions.add_transition(s, a, s2, r, terminal)

    # Runs a single episode in the current mode.
    # It ignores the mode if learn==true/false.
    def run_episode(self, sleep_time=0):
        self.new_episode()
        if sleep_time == 0:
            while not self._game.is_episode_finished():
                self.make_step()
        else:
            while not self._game.is_episode_finished():
                self.make_sleep_step(sleep_time)
        return np.float32(self._game.get_total_reward())

    # Utility stuff
    def get_actions_stats(self, clear=False, norm=True):
        stats = self._actions_stats.copy()
        if norm:
            stats = stats / np.float32(self._actions_stats.sum())
            stats[stats == 0.0] = -1
            stats = np.around(stats, 3)
        if clear:
            self._actions_stats.fill(0)
        return stats

    def get_steps(self):
        return self._steps

    def get_epsilon(self):
        return self._epsilon

    def get_network(self):
        return self._evaluator.network

    def set_epsilon(self, eps):
        self._epsilon = eps

    def set_skiprate(self, skiprate):
        self._skiprate = max(skiprate, 0)

    def get_skiprate(self):
        return self._skiprate

    # Saves network weights to a file.
    def save_params(self, filename, quiet=False):
        if not quiet:
            print "Saving network weights to " + filename + "..."
        self._prepare_for_save()
        params = get_all_param_values(self._evaluator.network)
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."

    # Loads network weights from a file.
    def load_params(self, filename, quiet=False):
        if not quiet:
            print "Loading network weights from " + filename + "..."
        params = pickle.load(open(filename, "rb"))
        set_all_param_values(self._evaluator.network, params)
        set_all_param_values(self._evaluator.frozen_network, params)
        if not quiet:
            print "Loading finished."

    # Loads the whole engine with params from a file.
    @staticmethod
    def load(game, filename, quiet=False):
        if not quiet:
            print "Loading qengine from " + filename + "..."
        params = pickle.load(open(filename, "rb"))
        qengine_args = params[0]
        network_params = params[1]
        steps = qengine_args["steps"]
        epsilon = qengine_args["epsilon"]
        del qengine_args["epsilon"]
        del qengine_args["steps"]
        qengine_args["game"] = game

        qengine = QEngine(**qengine_args)
        set_all_param_values(qengine._evaluator.network, network_params)
        set_all_param_values(qengine._evaluator.frozen_network, network_params)
        if not quiet:
            print "Loading finished."
        qengine._steps = steps
        qengine._epsilon = epsilon
        return qengine

    # Saves the whole engine with params to a file.
    def save(self, filename, quiet=False):
        if not quiet:
            print "Saving qengine to " + filename + "..."
        self._prepare_for_save()
        network_params = get_all_param_values(self._evaluator.network)
        params = [self.setup, network_params]
        pickle.dump(params, open(filename, "wb"))
        if not quiet:
            print "Saving finished."
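# A hedged save/load round-trip sketch for this older QEngine. Note that
# QEngine.load needs the (unpicklable) game object passed back in. The config
# path and filenames are illustrative assumptions, not project conventions.
if __name__ == "__main__":
    game = initialize_doom("config/basic.cfg")  # hypothetical config path
    engine = QEngine(game=game, name="basic_v1")
    engine.run_episode()
    engine.save("params/basic_v1")  # pickles [setup, network weights]

    game2 = initialize_doom("config/basic.cfg")
    restored = QEngine.load(game2, "params/basic_v1")
    print "restored steps:", restored.get_steps(), "epsilon:", restored.get_epsilon()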