# Tail a file with Twisted's inotify support. `Signal` is assumed to be the
# observer primitive used throughout this file (see the sketch further down).
from twisted.internet import inotify
from twisted.python import filepath


class FileWatcher(object):
    EVENT_MODIFIED = 2     # inotify IN_MODIFY
    EVENT_RENAMED = 2048   # inotify IN_MOVE_SELF

    def __init__(self, path):
        self.path = path
        self.open_file()
        self.bind_events()
        self.on_more_content = Signal()

    def on_file_changed(self, ignored, filepath, mask):
        handler = {self.EVENT_MODIFIED: self.read_more,
                   self.EVENT_RENAMED: self.rebind_events}.get(mask)
        if not handler:
            print("Ignoring event %s (%s)" % (mask, inotify.humanReadableMask(mask)))
            return
        return handler()

    def read_more(self):
        content = self.file.read()
        self.on_more_content.emit(content)

    def rebind_events(self):
        self.open_file()
        self.bind_events()

    def open_file(self):
        self.file = open(self.path, "r")

    def bind_events(self):
        self.notifier = inotify.INotify()
        self.notifier.startReading()
        self.notifier.watch(filepath.FilePath(self.path),
                            callbacks=[self.on_file_changed])
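# A minimal usage sketch for FileWatcher, assuming the Twisted reactor is the
# event loop and that Signal exposes connect() for receivers (both are
# assumptions of this sketch, not shown in the class above):

from twisted.internet import reactor

def tail(path):
    watcher = FileWatcher(path)
    watcher.on_more_content.connect(lambda content: print(content, end=""))
    reactor.run()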
class StatusTextManager(object):
    def __init__(self):
        self.messages = []
        self.sig_changed = Signal()
        CommManager().register_model(self)

    @property
    def last_message(self):
        return self.messages[-1]

    def add_message(self, msg):
        self.messages.append((time(), msg))
        if msg.severity == msgs.StatusText.DEBUG:
            ecu_log.debug(msg.text)
        elif msg.severity == msgs.StatusText.INFO:
            ecu_log.info(msg.text)
        elif msg.severity == msgs.StatusText.WARN:
            ecu_log.warn(msg.text)
        elif msg.severity == msgs.StatusText.ERROR:
            ecu_log.error(msg.text)
        elif msg.severity == msgs.StatusText.FAILURE:
            ecu_log.fatal(msg.text)
        else:
            ecu_log.debug("SEV(%s): %s", msg.severity, msg.text)
        self.sig_changed.emit()

    def clear(self):
        self.messages = []
def convert_samples_to_signals(samples: List[Sample]) -> List[Signal]:
    signals = []
    for index in range(len(samples) - 1):
        # a signal lasts from one sample until the next one arrives
        length = samples[index + 1].timestamp - samples[index].timestamp
        value = samples[index].value
        signals.append(Signal(length=length, value=value))
    # the final sample has no successor, so reuse the previous signal's length
    signals.append(Signal(length=signals[-1].length, value=samples[-1].value))
    return signals
class StatusManager(object):
    def __init__(self):
        self.last_message = None
        self.sig_changed = Signal()

    def update_status(self, msg):
        self.last_message = msg
        self.sig_changed.emit()

    def clear(self):
        pass
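# None of these snippets define `Signal` itself. The manager classes assume a
# connect/emit observer object, the RaceManager a fire() variant, and the
# Database a plain-call variant (the RL agents further down use a different
# Signal entirely: a statistics accumulator with add_sample()). A minimal
# sketch of the observer flavor, as an assumption rather than the original
# implementation:

class Signal(object):
    """Minimal synchronous signal: receivers subscribe, emitters notify."""

    def __init__(self, name=None):
        self.name = name
        self._receivers = []

    def connect(self, receiver):
        # register a callable to invoke on every emit
        self._receivers.append(receiver)

    def emit(self, *args, **kwargs):
        # notify receivers in subscription order
        for receiver in self._receivers:
            receiver(*args, **kwargs)

    # aliases for the other naming conventions used in this file
    fire = emit
    __call__ = emit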
def _extract_single_signal(samples: List[Sample]) -> Tuple[Signal, List[Sample]]:
    value = samples[0].value
    start_time = samples[0].timestamp
    for index in range(1, len(samples)):
        diff = samples[index].timestamp - samples[index - 1].timestamp
        if diff > MIN_DIFF_BETWEEN_SIGNALS:
            # the gap ends this signal; the remainder starts after the gap
            signal = Signal(value=value,
                            length=samples[index - 1].timestamp - start_time)
            return signal, samples[index:]
    return Signal(value=value, length=samples[-1].timestamp - start_time), []
def join_signals(signals: List[Signal]) -> List[Signal]:
    stack = []
    grouped_signals = []
    for signal in signals:
        if signal.length < MIN_DIFF_BETWEEN_SIGNALS:
            stack.append(signal)
        else:
            if stack:
                new_signal = Signal(value=stack[0].value,
                                    length=sum(s.length for s in stack))
                grouped_signals.append(new_signal)
                stack = []
            grouped_signals.append(signal)
    # flush any short signals still stacked at the end of the input;
    # guard against an empty stack (the original indexed stack[0] unconditionally)
    if stack:
        new_signal = Signal(value=stack[0].value,
                            length=sum(s.length for s in stack))
        grouped_signals.append(new_signal)
    return grouped_signals
def merge_same_signals_into_one(signals: List[int],
                                frequency: int) -> List[Signal]:
    grouped_signals = []
    for signal_value, signal_group in groupby(signals):
        signal_length = frequency_to_milliseconds(frequency) * len(list(signal_group))
        grouped_signals.append(Signal(length=signal_length, value=signal_value))
    return grouped_signals
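# A hedged end-to-end example of the signal helpers above. Sample, Signal,
# MIN_DIFF_BETWEEN_SIGNALS and frequency_to_milliseconds are not defined in
# these snippets, so the definitions below are assumptions for illustration:

from collections import namedtuple
from itertools import groupby

Sample = namedtuple('Sample', ['timestamp', 'value'])
Signal = namedtuple('Signal', ['length', 'value'])
MIN_DIFF_BETWEEN_SIGNALS = 10  # assumed threshold, in milliseconds

def frequency_to_milliseconds(frequency: int) -> float:
    # assumed helper: duration of one carrier cycle, in milliseconds
    return 1000.0 / frequency

# six raw on/off readings at an assumed 38 kHz carrier
raw = [1, 1, 1, 0, 0, 1]
merged = merge_same_signals_into_one(raw, 38000)
# -> [Signal(length≈0.079, value=1), Signal(length≈0.053, value=0),
#     Signal(length≈0.026, value=1)]
# all three are shorter than the threshold, so join_signals folds them into
# one signal carrying the first value and the summed length:
print(join_signals(merged))
# -> [Signal(length≈0.158, value=1)]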
class ValueOptimizationAgent(Agent):
    def __init__(self, env, tuning_parameters, replicated_device=None,
                 thread_id=0, create_target_network=True):
        Agent.__init__(self, env, tuning_parameters, replicated_device, thread_id)
        self.main_network = NetworkWrapper(tuning_parameters, create_target_network,
                                           self.has_global, 'main',
                                           self.replicated_device, self.worker_device)
        self.networks.append(self.main_network)
        self.q_values = Signal("Q")
        self.signals.append(self.q_values)
        self.reset_game(do_not_reset_env=True)

    # Algorithms for which q_values are calculated from predictions will override this function
    def get_q_values(self, prediction):
        return prediction

    def get_prediction(self, curr_state):
        return self.main_network.online_network.predict(self.tf_input_state(curr_state))

    def _validate_action(self, policy, action):
        if np.array(action).shape != ():
            raise ValueError(
                ('The exploration_policy {} returned a vector of actions '
                 'instead of a single action. ValueOptimizationAgents '
                 'require exploration policies which return a single action.'
                 ).format(policy.__class__.__name__))

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        prediction = self.get_prediction(curr_state)
        actions_q_values = self.get_q_values(prediction)

        # choose action according to the exploration policy and the current phase
        # (evaluating or training the agent)
        if phase == RunPhase.TRAIN:
            exploration_policy = self.exploration_policy
        else:
            exploration_policy = self.evaluation_exploration_policy
        action = exploration_policy.get_action(actions_q_values)
        self._validate_action(exploration_policy, action)

        # this is for bootstrapped dqn
        if type(actions_q_values) == list and len(actions_q_values) > 0:
            actions_q_values = actions_q_values[self.exploration_policy.selected_head]
        actions_q_values = actions_q_values.squeeze()

        # store the q values statistics for logging
        self.q_values.add_sample(actions_q_values)

        # store information for plotting interactively (actual plotting is done in agent)
        if self.tp.visualization.plot_action_values_online:
            for idx, action_name in enumerate(self.env.actions_description):
                self.episode_running_info[action_name].append(actions_q_values[idx])

        action_value = {"action_value": actions_q_values[action]}
        return action, action_value
# `data` and `output_all` are assumed to be defined earlier in the script
# (e.g. output_all = data[:, 0]); only the per-frequency slicing is shown here.
input_all = data[:, 1] * 5.86 * 0.01  # v_in: volts
freq_all = data[:, 2]
t_all = np.array([i * 0.002 for i, _ in enumerate(output_all)])

output = []
input = []
t = []
signal = []
for f in np.unique(freq_all):
    idxs = np.argwhere(freq_all == f)
    output.append(output_all[idxs].flatten())
    input.append(input_all[idxs].flatten())
    t.append(np.array([i * 0.01 for i, _ in enumerate(output[-1])]))
    try:
        signal.append(Signal(t[-1], input[-1], output[-1]))
    except Exception:
        # report the frequency whose data failed to build a Signal, then stop
        print(f)
        break

fig = plt.figure(figsize=(9, 5))
ax1 = fig.add_subplot(111)
ax1.plot(t_all, output_all, 'b')
ax1.plot(t_all, input_all, 'r')
ax1.grid()

fig2 = plt.figure(figsize=(9, 5))
ax2 = fig2.add_subplot(211)
ax3 = fig2.add_subplot(212)
w = []
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters,
                                        replicated_device, thread_id,
                                        create_target_network=True)
        self.last_gradient_update_step_idx = 0
        self.q_values = Signal('Q Values')
        self.unclipped_grads = Signal('Grads (unclipped)')
        self.value_loss = Signal('Value Loss')
        self.signals.append(self.q_values)
        self.signals.append(self.unclipped_grads)
        self.signals.append(self.value_loss)

    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # get the values for the current states
        state_value_head_targets = self.main_network.online_network.predict(current_states)

        # the targets for the state value estimator
        num_transitions = len(game_overs)

        if self.tp.agent.targets_horizon == '1-Step':
            # 1-Step Q learning
            q_st_plus_1 = self.main_network.target_network.predict(next_states)
            for i in reversed(range(num_transitions)):
                state_value_head_targets[i][actions[i]] = \
                    rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[i], 0)

        elif self.tp.agent.targets_horizon == 'N-Step':
            # N-Step Q learning
            if game_overs[-1]:
                R = 0
            else:
                R = np.max(self.main_network.target_network.predict(last_sample(next_states)))

            for i in reversed(range(num_transitions)):
                R = rewards[i] + self.tp.agent.discount * R
                state_value_head_targets[i][actions[i]] = R

        else:
            assert False, 'The available values for targets_horizon are: 1-Step, N-Step'

        # train
        result = self.main_network.online_network.accumulate_gradients(
            current_states, [state_value_head_targets])

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.unclipped_grads.add_sample(unclipped_grads)
        self.value_loss.add_sample(losses[0])

        return total_loss

    def train(self):
        # update the target network of every network that has a target network
        if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
            for network in self.networks:
                network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
            logger.create_signal_value('Update Target Network', 1)
        else:
            logger.create_signal_value('Update Target Network', 0, overwrite=False)

        return PolicyOptimizationAgent.train(self)
class ParamManager(object):
    def __init__(self):
        self.parameters = {}
        self.missing_ids = set()
        self._event = threading.Event()
        self.sig_changed = Signal()
        CommManager().register_model(self)

    @property
    def changed(self):
        return [p for p in self.parameters.values() if p.is_changed]

    def clear(self):
        self.parameters.clear()
        self.sig_changed.emit()

    def update_param(self, param_id, param_index, param_count, value):
        if len(self.missing_ids) == 0:
            self.missing_ids.update(range(param_count))

        p = self.parameters.get(param_id)
        if p:
            p._value = value
            p._changed = False
            log.debug("Update: %s: %s", p.param_id, p.value)
        else:
            self.parameters[param_id] = Parameter(param_id, param_index, value)
            log.debug("Add: %s: %s", param_id, value)

        self.missing_ids.discard(param_index)
        if len(self.missing_ids) == 0:
            log.debug("Retrieve done")
            self._event.set()

    def retrieve_all(self):
        self.missing_ids = set()
        self._event.clear()

        # request all
        CommManager().param_request()
        self._event.wait(10.0)

        # not necessary: try to request missing params individually
        if len(self.missing_ids) > 0:
            log.warn("Missing %d parameters, trying to request.", len(self.missing_ids))
            self._event.clear()
            for idx in self.missing_ids:
                CommManager().param_request(param_index=idx)
            self._event.wait(10.0)

        if len(self.missing_ids):
            log.error("Missing %d parameters", len(self.missing_ids))

        self.sig_changed.emit()
        return len(self.missing_ids) == 0

    def sync(self):
        to_sync = self.changed
        if len(to_sync) == 0:
            log.info("Nothing to sync")
            self.sig_changed.emit()
            return True

        self.missing_ids = set(p.param_index for p in to_sync)
        self._event.clear()
        for p in to_sync:
            CommManager().param_set(p.param_id, p.value)
        self._event.wait(10.0)

        if len(self.missing_ids):
            log.error("Not synced %d parameters", len(self.missing_ids))

        self.sig_changed.emit()
        return len(self.missing_ids) == 0
class Agent(object):
    def __init__(self, env, tuning_parameters, replicated_device=None, task_id=0):
        """
        :param env: An environment instance
        :type env: EnvironmentWrapper
        :param tuning_parameters: A Preset class instance with all the running parameters
        :type tuning_parameters: Preset
        :param replicated_device: A tensorflow device for distributed training (optional)
        :type replicated_device: instancemethod
        :param task_id: The current task id
        :type task_id: int
        """
        screen.log_title("Creating agent {}".format(task_id))
        self.task_id = task_id
        self.sess = tuning_parameters.sess
        self.env = tuning_parameters.env_instance = env
        self.imitation = False

        # i/o dimensions
        if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
            tuning_parameters.env.desired_observation_width = self.env.width
            tuning_parameters.env.desired_observation_height = self.env.height
        self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
        self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
        if tuning_parameters.agent.use_accumulated_reward_as_measurement:
            self.measurements_size = tuning_parameters.env.measurements_size = (self.measurements_size[0] + 1, )

        # modules
        if tuning_parameters.agent.load_memory_from_file_path:
            screen.log_title("Loading replay buffer from pickle. Pickle path: {}".format(
                tuning_parameters.agent.load_memory_from_file_path))
            self.memory = read_pickle(tuning_parameters.agent.load_memory_from_file_path)
        else:
            self.memory = eval(tuning_parameters.memory + '(tuning_parameters)')
        # self.architecture = eval(tuning_parameters.architecture)

        self.has_global = replicated_device is not None
        self.replicated_device = replicated_device
        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_id) \
            if replicated_device is not None else "/gpu:0"

        self.exploration_policy = eval(tuning_parameters.exploration.policy + '(tuning_parameters)')
        self.evaluation_exploration_policy = eval(
            tuning_parameters.exploration.evaluation_policy + '(tuning_parameters)')
        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

        # initialize all internal variables
        self.tp = tuning_parameters
        self.in_heatup = False
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.current_episode = self.tp.current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        logger.set_current_time(self.current_episode)
        self.main_network = None
        self.networks = []
        self.last_episode_images = []
        self.renderer = Renderer()

        # signals
        self.signals = []
        self.loss = Signal('Loss')
        self.signals.append(self.loss)
        self.curr_learning_rate = Signal('Learning Rate')
        self.signals.append(self.curr_learning_rate)

        if self.tp.env.normalize_observation and not self.env.is_state_type_image:
            if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                self.running_observation_stats = RunningStat((self.tp.env.desired_observation_width, ))
                self.running_reward_stats = RunningStat(())
            else:
                self.running_observation_stats = SharedRunningStats(
                    self.tp, replicated_device,
                    shape=(self.tp.env.desired_observation_width, ),
                    name='observation_stats')
                self.running_reward_stats = SharedRunningStats(
                    self.tp, replicated_device,
                    shape=(),
                    name='reward_stats')

        # env is already reset at this point. Otherwise we're getting an error
        # where you cannot reset an env which is not done
        self.reset_game(do_not_reset_env=True)

        # use seed
        if self.tp.seed is not None:
            random.seed(self.tp.seed)
            np.random.seed(self.tp.seed)

    def log_to_screen(self, phase):
        # log to screen
        if self.current_episode >= 0:
            if phase == RunPhase.TRAIN:
                exploration = self.exploration_policy.get_control_param()
            else:
                exploration = self.evaluation_exploration_policy.get_control_param()
            screen.log_dict(OrderedDict([
                ("Worker", self.task_id),
                ("Episode", self.current_episode),
                ("total reward", self.total_reward_in_current_episode),
                ("exploration", exploration),
                ("steps", self.total_steps_counter),
                ("training iteration", self.training_iteration)
            ]), prefix=phase)

    def update_log(self, phase=RunPhase.TRAIN):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        logger.set_current_time(self.current_episode)
        logger.create_signal_value('Training Iter', self.training_iteration)
        logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
        logger.create_signal_value('ER #Transitions', self.memory.num_transitions())
        logger.create_signal_value('ER #Episodes', self.memory.length())
        logger.create_signal_value('Episode Length', self.current_episode_steps_counter)
        logger.create_signal_value('Total steps', self.total_steps_counter)
        logger.create_signal_value("Epsilon", self.exploration_policy.get_control_param())
        logger.create_signal_value("Training Reward", self.total_reward_in_current_episode
                                   if phase == RunPhase.TRAIN else np.nan)
        logger.create_signal_value('Evaluation Reward', self.total_reward_in_current_episode
                                   if phase == RunPhase.TEST else np.nan)
        logger.create_signal_value('Update Target Network', 0, overwrite=False)
        logger.update_wall_clock_time(self.current_episode)

        for signal in self.signals:
            logger.create_signal_value("{}/Mean".format(signal.name), signal.get_mean())
            logger.create_signal_value("{}/Stdev".format(signal.name), signal.get_stdev())
            logger.create_signal_value("{}/Max".format(signal.name), signal.get_max())
            logger.create_signal_value("{}/Min".format(signal.name), signal.get_min())

        # dump
        if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                and self.current_episode > 0:
            logger.dump_output_csv()

    def reset_game(self, do_not_reset_env=False):
        """
        Resets all the episodic parameters and starts a new environment episode.
        :param do_not_reset_env: A boolean that allows prevention of environment reset
        :return: None
        """
        for signal in self.signals:
            signal.reset()
        self.total_reward_in_current_episode = 0
        self.curr_state = {}
        self.last_episode_images = []
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        if not do_not_reset_env:
            self.env.reset()
        self.exploration_policy.reset()

        # required for online plotting
        if self.tp.visualization.plot_action_values_online:
            if hasattr(self, 'episode_running_info') and hasattr(self.env, 'actions_description'):
                for action in self.env.actions_description:
                    self.episode_running_info[action] = []
                plt.clf()

        if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
            for network in self.networks:
                network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init
                network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init

    def preprocess_observation(self, observation):
        """
        Preprocesses the given observation.
        For images - convert to grayscale, resize and convert to int.
        For measurements vectors - normalize by a running average and std.
        :param observation: The agent's observation
        :return: A processed version of the observation
        """
        if self.env.is_state_type_image:
            # rescale
            observation = scipy.misc.imresize(
                observation,
                (self.tp.env.desired_observation_height,
                 self.tp.env.desired_observation_width),
                interp=self.tp.rescaling_interpolation_type)

            # rgb to y
            if len(observation.shape) > 2 and observation.shape[2] > 1:
                r, g, b = observation[:, :, 0], observation[:, :, 1], observation[:, :, 2]
                observation = 0.2989 * r + 0.5870 * g + 0.1140 * b

            # Render the processed observation which is how the agent will see it
            # Warning: this cannot currently be done in parallel to rendering the environment
            if self.tp.visualization.render_observation:
                if not self.renderer.is_open:
                    self.renderer.create_screen(observation.shape[0], observation.shape[1])
                self.renderer.render_image(observation)

            return observation.astype('uint8')
        else:
            if self.tp.env.normalize_observation:
                # standardize the input observation using a running mean and std
                if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                    self.running_observation_stats.push(observation)
                observation = (observation - self.running_observation_stats.mean) / \
                              (self.running_observation_stats.std + 1e-15)
                observation = np.clip(observation, -5.0, 5.0)
            return observation

    def learn_from_batch(self, batch):
        """
        Given a batch of transitions, calculates their target values and updates the network.
        :param batch: A list of transitions
        :return: The loss of the training
        """
        pass

    def train(self):
        """
        A single training iteration. Sample a batch, train on it and update target networks.
        :return: The training loss.
        """
        batch = self.memory.sample(self.tp.batch_size)
        loss = self.learn_from_batch(batch)

        if self.tp.learning_rate_decay_rate != 0:
            self.curr_learning_rate.add_sample(self.tp.sess.run(self.tp.learning_rate))
        else:
            self.curr_learning_rate.add_sample(self.tp.learning_rate)

        # update the target network of every network that has a target network
        if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
            for network in self.networks:
                network.update_target_network(self.tp.agent.rate_for_copying_weights_to_target)
            logger.create_signal_value('Update Target Network', 1)
        else:
            logger.create_signal_value('Update Target Network', 0, overwrite=False)

        return loss

    def extract_batch(self, batch):
        """
        Extracts a single numpy array for each object in a batch of transitions (state, action, etc.)
        :param batch: An array of transitions
        :return: For each transition element, returns a numpy array of all the transitions in the batch
        """
        current_states = {}
        next_states = {}
        current_states['observation'] = np.array(
            [transition.state['observation'] for transition in batch])
        next_states['observation'] = np.array(
            [transition.next_state['observation'] for transition in batch])
        actions = np.array([transition.action for transition in batch])
        rewards = np.array([transition.reward for transition in batch])
        game_overs = np.array([transition.game_over for transition in batch])
        total_return = np.array([transition.total_return for transition in batch])

        # get the entire state including measurements if available
        if self.tp.agent.use_measurements:
            current_states['measurements'] = np.array(
                [transition.state['measurements'] for transition in batch])
            next_states['measurements'] = np.array(
                [transition.next_state['measurements'] for transition in batch])

        return current_states, next_states, actions, rewards, game_overs, total_return

    def plot_action_values_online(self):
        """
        Plot an animated graph of the value of each possible action during the episode
        :return: None
        """
        plt.clf()
        for key, data_list in self.episode_running_info.items():
            plt.plot(data_list, label=key)
        plt.legend()
        plt.pause(0.00000001)

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        """
        Choose an action to act with in the current episode being played. Different behavior
        might be exhibited when training or testing.
        :param curr_state: the current state to act upon.
        :param phase: the current phase: training or testing.
        :return: chosen action, some action value describing the action (q-value, probability, etc.)
        """
        pass

    def preprocess_reward(self, reward):
        if self.tp.env.reward_scaling:
            reward /= float(self.tp.env.reward_scaling)
        if self.tp.env.reward_clipping_max:
            reward = min(reward, self.tp.env.reward_clipping_max)
        if self.tp.env.reward_clipping_min:
            reward = max(reward, self.tp.env.reward_clipping_min)
        return reward

    def tf_input_state(self, curr_state):
        """
        Convert curr_state into the input tensors tensorflow is expecting.
""" # add batch axis with length 1 onto each value # extract values from the state based on agent.input_types input_state = {} for input_name in self.tp.agent.input_types.keys(): input_state[input_name] = np.expand_dims( np.array(curr_state[input_name]), 0) return input_state def act(self, phase=RunPhase.TRAIN): """ Take one step in the environment according to the network prediction and store the transition in memory :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored :return: A boolean value that signals an episode termination """ self.total_steps_counter += 1 self.current_episode_steps_counter += 1 # get new action action_info = { "action_probability": 1.0 / self.env.action_space_size, "action_value": 0 } is_first_transition_in_episode = (self.curr_state == {}) if is_first_transition_in_episode: if not isinstance(self.env.state, dict): raise ValueError( ('expected state to be a dictionary, found {}').format( type(self.env.state))) state = self.env.state # TODO: modify preprocess_observation to modify the entire state # for now, only preprocess the observation state['observation'] = self.preprocess_observation( state['observation']) # TODO: provide option to stack more than just the observation # TODO: this should probably be happening in an environment wrapper anyway state['observation'] = stack_observation( [], state['observation'], self.tp.env.observation_stack_size) self.curr_state = state if self.tp.agent.use_measurements: # TODO: this should be handled in the environment self.curr_state['measurements'] = self.env.measurements if self.tp.agent.use_accumulated_reward_as_measurement: self.curr_state['measurements'] = np.append( self.curr_state['measurements'], 0) if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions: action = self.env.get_random_action() else: action, action_info = self.choose_action(self.curr_state, phase=phase) # perform action if type(action) == np.ndarray: action = action.squeeze() result = self.env.step(action) shaped_reward = self.preprocess_reward(result['reward']) if 'action_intrinsic_reward' in action_info.keys(): shaped_reward += action_info['action_intrinsic_reward'] # TODO: should total_reward_in_current_episode include shaped_reward? 
        self.total_reward_in_current_episode += result['reward']

        next_state = result['state']
        next_state['observation'] = self.preprocess_observation(next_state['observation'])

        # plot action values online
        if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
            self.plot_action_values_online()

        # initialize the next state
        # TODO: provide option to stack more than just the observation
        next_state['observation'] = stack_observation(self.curr_state['observation'],
                                                      next_state['observation'],
                                                      self.tp.env.observation_stack_size)

        if self.tp.agent.use_measurements and 'measurements' in result.keys():
            next_state['measurements'] = result['state']['measurements']
            if self.tp.agent.use_accumulated_reward_as_measurement:
                next_state['measurements'] = np.append(next_state['measurements'],
                                                       self.total_reward_in_current_episode)

        # store the transition only if we are training
        if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
            transition = Transition(self.curr_state, result['action'], shaped_reward,
                                    next_state, result['done'])
            for key in action_info.keys():
                transition.info[key] = action_info[key]
            if self.tp.agent.add_a_normalized_timestep_to_the_observation:
                transition.info['timestep'] = float(self.current_episode_steps_counter) / self.env.timestep_limit
            self.memory.store(transition)
        elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
            # we store the transitions only for saving gifs
            self.last_episode_images.append(self.env.get_rendered_image())

        # update the current state for the next step
        self.curr_state = next_state

        # deal with episode termination
        if result['done']:
            if self.tp.visualization.dump_csv:
                self.update_log(phase=phase)
            self.log_to_screen(phase=phase)

            if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
                self.reset_game()

            self.current_episode += 1
            self.tp.current_episode = self.current_episode

        # return whether the episode really ended
        return result['done']

    def evaluate(self, num_episodes, keep_networks_synced=False):
        """
        Run in an evaluation mode for several episodes. Actions will be chosen greedily.
        :param keep_networks_synced: keep the online network in sync with the global network
                                     after every episode
        :param num_episodes: The number of episodes to evaluate on
        :return: None
        """
        max_reward_achieved = -float('inf')
        average_evaluation_reward = 0
        screen.log_title("Running evaluation")
        self.env.change_phase(RunPhase.TEST)
        for i in range(num_episodes):
            # keep the online network in sync with the global network
            if keep_networks_synced:
                for network in self.networks:
                    network.sync()

            episode_ended = False
            while not episode_ended:
                episode_ended = self.act(phase=RunPhase.TEST)

                if keep_networks_synced \
                        and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps:
                    for network in self.networks:
                        network.sync()

            if self.total_reward_in_current_episode > max_reward_achieved:
                max_reward_achieved = self.total_reward_in_current_episode
                frame_skipping = int(5 / self.tp.env.frame_skip)
                if self.tp.visualization.dump_gifs:
                    logger.create_gif(self.last_episode_images[::frame_skipping],
                                      name='score-{}'.format(max_reward_achieved), fps=10)

            average_evaluation_reward += self.total_reward_in_current_episode
            self.reset_game()

        average_evaluation_reward /= float(num_episodes)

        self.env.change_phase(RunPhase.TRAIN)
        screen.log_title("Evaluation done. Average reward = {}.".format(average_evaluation_reward))

    def post_training_commands(self):
        pass

    def improve(self):
        """
        Training algorithms wrapper.
        Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ]
        :return: None
        """
        # synchronize the online network weights with the global network
        for network in self.networks:
            network.sync()

        # heatup phase
        if self.tp.num_heatup_steps != 0:
            self.in_heatup = True
            screen.log_title("Starting heatup {}".format(self.task_id))
            num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
            for step in range(max(self.tp.num_heatup_steps,
                                  num_steps_required_for_one_training_batch)):
                self.act(phase=RunPhase.HEATUP)

        # training phase
        self.in_heatup = False
        screen.log_title("Starting training {}".format(self.task_id))
        self.exploration_policy.change_phase(RunPhase.TRAIN)
        training_start_time = time.time()
        model_snapshots_periods_passed = -1

        while self.training_iteration < self.tp.num_training_iterations:
            # evaluate
            evaluate_agent = (self.last_episode_evaluation_ran is not self.current_episode) and \
                             (self.current_episode % self.tp.evaluate_every_x_episodes == 0)
            evaluate_agent = evaluate_agent or \
                             (self.imitation and self.training_iteration > 0 and
                              self.training_iteration % self.tp.evaluate_every_x_training_iterations == 0)

            if evaluate_agent:
                self.env.reset()
                self.last_episode_evaluation_ran = self.current_episode
                self.evaluate(self.tp.evaluation_episodes)

            # snapshot model
            if self.tp.save_model_sec and self.tp.save_model_sec > 0 and not self.tp.distributed:
                total_training_time = time.time() - training_start_time
                current_snapshot_period = (int(total_training_time) // self.tp.save_model_sec)
                if current_snapshot_period > model_snapshots_periods_passed:
                    model_snapshots_periods_passed = current_snapshot_period
                    self.save_model(model_snapshots_periods_passed)

            # play and record in replay buffer
            if self.tp.agent.collect_new_data:
                if self.tp.agent.step_until_collecting_full_episodes:
                    step = 0
                    while step < self.tp.agent.num_consecutive_playing_steps or \
                            self.memory.get_episode(-1).length() != 0:
                        self.act()
                        step += 1
                else:
                    for step in range(self.tp.agent.num_consecutive_playing_steps):
                        self.act()

            # train
            if self.tp.train:
                for step in range(self.tp.agent.num_consecutive_training_steps):
                    loss = self.train()
                    self.loss.add_sample(loss)
                    self.training_iteration += 1
                if self.imitation:
                    self.log_to_screen(RunPhase.TRAIN)
                self.post_training_commands()

    def save_model(self, model_id):
        self.main_network.save_model(model_id)
class NAFAgent(ValueOptimizationAgent):
    def __init__(self, env, tuning_parameters, replicated_device=None, thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters,
                                        replicated_device, thread_id)
        self.l_values = Signal("L")
        self.a_values = Signal("Advantage")
        self.mu_values = Signal("Action")
        self.v_values = Signal("V")
        self.signals += [self.l_values, self.a_values, self.mu_values, self.v_values]

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(batch)

        # TD error = r + discount*v_st_plus_1 - q_st
        v_st_plus_1 = self.main_network.target_network.predict(
            next_states,
            self.main_network.target_network.output_heads[0].V,
            squeeze_output=False,
        )
        TD_targets = np.expand_dims(rewards, -1) + \
            (1.0 - np.expand_dims(game_overs, -1)) * self.tp.agent.discount * v_st_plus_1

        if len(actions.shape) == 1:
            actions = np.expand_dims(actions, -1)

        result = self.main_network.train_and_sync_networks(
            {**current_states, 'output_0_0': actions}, TD_targets)
        total_loss = result[0]

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        assert not self.env.discrete_controls, 'NAF works only for continuous control problems'

        # convert to batch so we can run it through the network
        # observation = np.expand_dims(np.array(curr_state['observation']), 0)
        naf_head = self.main_network.online_network.output_heads[0]
        action_values = self.main_network.online_network.predict(
            self.tf_input_state(curr_state),
            outputs=naf_head.mu,
            squeeze_output=False,
        )
        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
        else:
            action = action_values

        Q, L, A, mu, V = self.main_network.online_network.predict(
            {**self.tf_input_state(curr_state), 'output_0_0': action_values},
            outputs=[naf_head.Q, naf_head.L, naf_head.A, naf_head.mu, naf_head.V],
        )

        # store the q values statistics for logging
        self.q_values.add_sample(Q)
        self.l_values.add_sample(L)
        self.a_values.add_sample(A)
        self.mu_values.add_sample(mu)
        self.v_values.add_sample(V)

        action_value = {"action_value": Q}
        return action, action_value
class RaceManager:
    testSpeedRatio = 1

    def __init__(self):
        self.fleets = []
        self.fleetsById = {}
        self.changed = Signal()
        self.finishes = []
        self.finishesById = {}
        # we store these on the race manager so that they get pickled
        self.nextFleetId = 1
        self.nextFinishId = 1

    #
    # this method controls how the RaceManager is pickled. We want to avoid pickling
    # the Signal object stored on the changed attribute
    #
    def __getstate__(self):
        attributes = self.__dict__.copy()
        del attributes["changed"]
        return attributes

    #
    # this method controls how the RaceManager is unpickled. We need to set the changed
    # attribute as it is not part of the pickle
    #
    def __setstate__(self, d):
        self.__dict__ = d
        self.changed = Signal()

    def incrementNextFleetId(self):
        self.nextFleetId = self.nextFleetId + 1

    def incrementNextFinishId(self):
        self.nextFinishId = self.nextFinishId + 1

    def adjustedSeconds(self, unadjustedSeconds):
        return unadjustedSeconds * RaceManager.testSpeedRatio

    def unadjustedSecond(self, adjustedSeconds):
        return adjustedSeconds / RaceManager.testSpeedRatio

    #
    # Create a fleet, add it to our fleets and return it. If the name is not specified,
    # we create a name as 'Fleet N' where N is the number of fleets.
    #
    def createFleet(self, name=None):
        aFleet = Fleet(name=name, fleetId=self.nextFleetId)
        self.incrementNextFleetId()
        self.addFleet(aFleet)
        return aFleet

    def fleetWithId(self, fleetId):
        if fleetId in self.fleetsById:
            return self.fleetsById[fleetId]

    def addFleet(self, aFleet):
        self.fleets.append(aFleet)
        self.fleetsById[aFleet.fleetId] = aFleet
        self.changed.fire("fleetAdded", aFleet)

    def removeFleet(self, aFleet):
        if aFleet in self.fleets:
            positionInList = self.fleets.index(aFleet)
            self.fleets.remove(aFleet)
            del self.fleetsById[aFleet.fleetId]
            self.changed.fire("fleetRemoved", aFleet)
        else:
            raise RaceException("Fleet not found", aFleet)

    def numberFleets(self):
        return len(self.fleets)

    def hasFleets(self):
        return self.numberFleets() > 0

    #
    # Start our race sequence in ten seconds with a five minute warning before the first
    # fleet, i.e. 10 minutes to the first fleet start. This is an F flag start.
    #
    def startRaceSequenceWithWarning(self):
        logging.info("Start sequence with warning (F flag start)")
        fleetNumber = 0
        now = datetime.now()
        sequenceStart = now + timedelta(seconds=10)
        for fleet in self.fleets:
            fleetNumber = fleetNumber + 1
            startTime = sequenceStart + timedelta(
                seconds=(WARNING_SECONDS / RaceManager.testSpeedRatio
                         + (START_SECONDS * fleetNumber) / RaceManager.testSpeedRatio))
            self.updateFleetStartTime(fleet, startTime)
        self.changed.fire("sequenceStartedWithWarning")

    #
    # Start our race sequence without a warning (i.e. class start)
    #
    def startRaceSequenceWithoutWarning(self):
        logging.info("Start sequence without warning (class flag start)")
        fleetNumber = 0
        now = datetime.now()
        sequenceStart = now + timedelta(seconds=10)
        for fleet in self.fleets:
            fleetNumber = fleetNumber + 1
            startTime = sequenceStart + timedelta(
                seconds=(START_SECONDS * fleetNumber) / RaceManager.testSpeedRatio)
            self.updateFleetStartTime(fleet, startTime)
        self.changed.fire("sequenceStartedWithoutWarning")

    #
    # Update the startTime for a fleet. Do this through the race manager
    # so that the race manager can signal the event change
    #
    def updateFleetStartTime(self, aFleet, startTime):
        aFleet.startTime = startTime
        # signal that the fleet start time has changed
        self.changed.fire("fleetChanged", aFleet)

    #
    # Find the last fleet started. This is a reverse search
    # of the fleets list for a started fleet.
    # Returns None if not found
    #
    def lastFleetStarted(self):
        for fleet in reversed(self.fleets):
            if fleet.isStarted():
                return fleet
        return None

    #
    # Find the next fleet to start. If we don't have a fleet starting,
    # return None.
    #
    def nextFleetToStart(self):
        for fleet in self.fleets:
            if fleet.isStarting() or fleet.isWaitingToStart():
                return fleet
        return None

    def hasStartedFleet(self):
        return self.lastFleetStarted() is not None

    def hasSequenceStarted(self):
        if self.nextFleetToStart():
            return True
        else:
            return False

    #
    # Reset start sequence - set all fleets to no start time, and fire a signal
    #
    def resetStartSequence(self):
        for fleet in self.fleets:
            fleet.startTime = None
        self.removeAllFinishes()
        self.changed.fire("startSequenceReset")

    def lastFleet(self):
        return self.fleets[-1]

    #
    # Perform a general recall. This is always for the fleet that
    # has most recently started
    #
    def generalRecall(self):
        logging.info("General recall")
        fleetToRecall = self.lastFleetStarted()
        # if this is the last (or only) fleet, set its start time to be six
        # minutes from now
        if fleetToRecall == self.fleets[-1]:
            logging.info("General recall last fleet")
            self.updateFleetStartTime(
                fleetToRecall,
                datetime.now() + timedelta(
                    seconds=(START_SECONDS + LAST_START_GENERAL_RECALL_DELAY) / RaceManager.testSpeedRatio))
        # otherwise kick the fleet to the back of the queue,
        # with a start time five minutes after the last fleet
        else:
            self.removeFleet(fleetToRecall)
            lastFleet = self.fleets[-1]
            self.updateFleetStartTime(
                fleetToRecall,
                lastFleet.startTime + timedelta(seconds=START_SECONDS / RaceManager.testSpeedRatio))
            self.addFleet(fleetToRecall)
            logging.log(logging.INFO,
                        "General recall not last fleet. Moving to back of queue. "
                        "Delta to start time now %d seconds",
                        fleetToRecall.adjustedDeltaSecondsToStartTime())
        self.changed.fire("generalRecall", fleetToRecall)

    #
    # Create a finish and add it to the race manager's list of finishes.
    # This method returns a finish object. By default, the finish object will
    # have a finish time of now and no fleet.
    #
    def createFinish(self, fleet=None, finishTime=None):
        # if no finish time is supplied, set the finish time to be now
        if not finishTime:
            finishTime = datetime.now()
        # create the finish object
        # if we only have one fleet, this will be the fleet for the finish
        if self.numberFleets() == 1:
            fleet = self.fleets[0]
        aFinish = Finish(fleet=fleet, finishTime=finishTime, finishId=self.nextFinishId)
        self.incrementNextFinishId()
        self.addFinish(aFinish)
        return aFinish

    def addFinish(self, finish):
        # add it to our list of finish objects
        self.finishes.append(finish)
        self.finishesById[finish.finishId] = finish
        # fire a change signal
        self.changed.fire("finishAdded", finish)

    def removeFinish(self, finish):
        self.finishes.remove(finish)
        del self.finishesById[finish.finishId]
        self.changed.fire("finishRemoved", finish)

    def updateFinish(self, finish):
        self.changed.fire("finishChanged", finish)

    def removeAllFinishes(self):
        for finish in list(self.finishes):
            self.removeFinish(finish)

    def finishWithId(self, finishId):
        if finishId in self.finishesById:
            return self.finishesById[finishId]
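# A minimal usage sketch for RaceManager. Fleet, Finish, WARNING_SECONDS and
# START_SECONDS come from the surrounding module; the connect() call assumes
# the Signal sketch above:

manager = RaceManager()
manager.changed.connect(lambda event, *args: logging.info("race event: %s %s", event, args))
fast = manager.createFleet(name="Fast fleet")
slow = manager.createFleet(name="Slow fleet")
manager.startRaceSequenceWithWarning()  # F flag start: first fleet starts in ~10 minutes
finish = manager.createFinish(fleet=fast)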
import json
from collections import namedtuple
from typing import Callable, Dict

# Channel is defined elsewhere in the module.


class Database:
    __slots__ = ['_path', '_signal', '_db']

    Event = namedtuple('Event', ['type', 'feed', 'link'])

    def __init__(self, path: str):
        self._path = path
        self._signal = Signal()
        self._db = self._load_db()

    def __getitem__(self, key):
        return self._db[key]

    def feeds(self):
        return self._db.keys()

    def channels(self):
        return (ch for chs in self._db.values() for ch in chs.values())

    def channels_of_feed(self, feed: str, links: bool = True):
        origin = self._db[feed].keys if links else self._db[feed].values
        return (ch for ch in origin())

    def add_feed(self, link: str):
        self._db[link] = {}
        self.flush()

    def add_channel(self, link: str, last_id: int, feed: str):
        self._db[feed][link] = Channel(link, last_id, feed)
        self.flush()
        self._publish(self.Event('add', feed, link))

    def remove_channel(self, link: str, feed: str):
        del self._db[feed][link]
        self.flush()
        self._publish(self.Event('remove', feed, link))

    def remove_feed(self, feed: str):
        for link in self._db[feed].keys():
            self._publish(self.Event('remove', feed, link))
        del self._db[feed]
        self.flush()

    def feed_nonempty(self, feed: str):
        return bool(self._db[feed].keys())

    def feed_exists(self, link: str):
        return link in self._db.keys()

    def channel_exists(self, link: str, feed: str = None):
        return link in self._db[feed].keys() if feed else any(
            link in chs.keys() for chs in self._db.values())

    def flush(self):
        with open(self._path, 'w') as db:
            json.dump(self._db, db)

    # the class name is not bound yet while the class body executes, so the
    # Event annotations must be strings (forward references)
    def subscribe(self, receiver: Callable[['Database.Event'], None]):
        self._signal.connect(receiver)

    def _publish(self, event: 'Database.Event'):
        self._signal(event)

    def _load_db(self) -> Dict[str, Dict[str, Channel]]:
        try:
            with open(self._path) as db:
                _db = json.load(db)
            for channels in _db.values():
                for link in channels.keys():
                    channels[link] = Channel(*channels[link])
            return _db
        except FileNotFoundError:
            return {}
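# A minimal usage sketch for Database (Channel and the JSON file layout are
# assumed from the surrounding module):

db = Database('subscriptions.json')
db.subscribe(lambda event: print(event.type, event.feed, event.link))
if not db.feed_exists('news'):
    db.add_feed('news')
db.add_channel('t.me/example', last_id=0, feed='news')  # -> prints: add news t.me/example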