Example #1
0
class FileWatcher(object):
    EVENT_MODIFIED = 2
    EVENT_RENAMED = 2048

    def __init__(self, path):
        self.path = path
        self.open_file()
        self.bind_events()
        self.on_more_content = Signal()

    def on_file_changed(self, ignored, filepath, mask):
        handler = {self.EVENT_MODIFIED: self.read_more, self.EVENT_RENAMED: self.rebind_events}.get(mask)
        if not handler:
            print "Ignoring event %s (%s)" % (mask, inotify.humanReadableMask(mask))
            return
        return handler()

    def read_more(self):
        content = self.file.read()
        self.on_more_content.emit(content)

    def rebind_events(self):
        self.open_file()
        self.bind_events()

    def open_file(self):
        self.file = open(self.path, "r")

    def bind_events(self):
        self.notifier = inotify.INotify()
        self.notifier.startReading()
        self.notifier.watch(filepath.FilePath(self.path), callbacks=[self.on_file_changed])
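
# Hypothetical usage sketch (not part of the original example): assumes a
# Twisted reactor is available and that the Signal class used above exposes a
# connect()-style method for subscribing callbacks -- both are assumptions.
if __name__ == "__main__":
    from twisted.internet import reactor

    watcher = FileWatcher("/var/log/app.log")
    watcher.on_more_content.connect(lambda content: print(content, end=""))
    reactor.run()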
Example #2
0
class StatusTextManager(object):
    def __init__(self):
        self.messages = []
        self.sig_changed = Signal()
        CommManager().register_model(self)

    @property
    def last_message(self):
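        # NOTE: raises IndexError until the first message has been added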
        return self.messages[-1]

    def add_message(self, msg):
        self.messages.append((time(), msg))
        if msg.severity == msgs.StatusText.DEBUG:
            ecu_log.debug(msg.text)
        elif msg.severity == msgs.StatusText.INFO:
            ecu_log.info(msg.text)
        elif msg.severity == msgs.StatusText.WARN:
            ecu_log.warn(msg.text)
        elif msg.severity == msgs.StatusText.ERROR:
            ecu_log.error(msg.text)
        elif msg.severity == msgs.StatusText.FAILURE:
            ecu_log.fatal(msg.text)
        else:
            ecu_log.debug("SEV(%s): %s", msg.severity, msg.text)

        self.sig_changed.emit()

    def clear(self):
        self.messages = []
Example #3
0
def convert_samples_to_signals(samples: List[Sample]) -> List[Signal]:
    signals = []
    for index in range(len(samples) - 1):
        length = samples[index + 1].timestamp - samples[index].timestamp
        value = samples[index].value
        signals.append(Signal(length=length, value=value))
    signals.append(Signal(length=signals[-1].length, value=samples[-1].value))
    return signals
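
# Worked example (hypothetical values): samples at t = 0, 10, 25 with values
# 1, 0, 1 yield Signal(length=10, value=1), Signal(length=15, value=0) and a
# final Signal(length=15, value=1) that reuses the previous length. At least
# two samples are assumed, since the last append reads signals[-1].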
Example #4
0
class StatusManager(object):
    def __init__(self):
        self.last_message = None
        self.sig_changed = Signal()

    def update_status(self, msg):
        self.last_message = msg
        self.sig_changed.emit()

    def clear(self):
        pass
Example #5
0
def _extract_single_signal(
        samples: List[Sample]) -> Tuple[Signal, List[Sample]]:
    value = samples[0].value
    start_time = samples[0].timestamp
    for index in range(1, len(samples)):
        diff = samples[index].timestamp - samples[index - 1].timestamp
        if diff > MIN_DIFF_BETWEEN_SIGNALS:
            signal = Signal(value=value,
                            length=samples[index - 1].timestamp - start_time)
            return signal, samples[index - 1:]

    return Signal(value=value, length=samples[-1].timestamp - start_time), []
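
# Worked example (hypothetical values, MIN_DIFF_BETWEEN_SIGNALS = 10): samples
# at t = 0, 1, 2, 100 with value 1 yield Signal(value=1, length=2) plus the
# remaining samples starting at t = 2, because the slice samples[index - 1:]
# keeps the last sample seen before the gap.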
Example #6
0
 def __init__(self,
              env,
              tuning_parameters,
              replicated_device=None,
              thread_id=0):
     ValueOptimizationAgent.__init__(self,
                                     env,
                                     tuning_parameters,
                                     replicated_device,
                                     thread_id,
                                     create_target_network=True)
     self.last_gradient_update_step_idx = 0
     self.q_values = Signal('Q Values')
     self.unclipped_grads = Signal('Grads (unclipped)')
     self.value_loss = Signal('Value Loss')
     self.signals.append(self.q_values)
     self.signals.append(self.unclipped_grads)
     self.signals.append(self.value_loss)
Example #7
0
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 thread_id=0,
                 create_target_network=True):
        Agent.__init__(self, env, tuning_parameters, replicated_device,
                       thread_id)
        self.main_network = NetworkWrapper(tuning_parameters,
                                           create_target_network,
                                           self.has_global, 'main',
                                           self.replicated_device,
                                           self.worker_device)
        self.networks.append(self.main_network)
        self.q_values = Signal("Q")
        self.signals.append(self.q_values)

        self.reset_game(do_not_reset_env=True)
Example #8
0
 def __init__(self):
     self.fleets = []
     self.fleetsById = {}
     self.changed = Signal()
     self.finishes = []
     self.finishesById = {}
     # we store these on the race manager so that they get pickled
     self.nextFleetId = 1
     self.nextFinishId = 1
Example #9
0
def join_signals(signals: List[Signal]) -> List[Signal]:
    stack = []
    grouped_signals = []
    for signal in signals:
        if signal.length < MIN_DIFF_BETWEEN_SIGNALS:
            stack.append(signal)
        else:
            if stack:
                new_signal = Signal(value=stack[0].value,
                                    length=sum([s.length for s in stack]))
                grouped_signals.append(new_signal)
                stack = []
            grouped_signals.append(signal)

    # flush any remaining short signals, guarding against an empty stack
    if stack:
        new_signal = Signal(value=stack[0].value,
                            length=sum([s.length for s in stack]))
        grouped_signals.append(new_signal)
    return grouped_signals
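
# Worked example (hypothetical values, MIN_DIFF_BETWEEN_SIGNALS = 10): the
# input [Signal(length=2, value=1), Signal(length=3, value=1),
# Signal(length=20, value=0)] groups the two short signals into
# Signal(value=1, length=5) and keeps Signal(length=20, value=0) unchanged;
# with an empty trailing stack the final flush is skipped.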
Example #10
0
def merge_same_signals_into_one(signals: List[int],
                                frequency: int) -> List[Signal]:
    grouped_signals = []
    for signal_value, signal_group in groupby(signals):
        signal_length = frequency_to_milliseconds(frequency) * len(
            list(signal_group))
        grouped_signals.append(Signal(length=signal_length,
                                      value=signal_value))
    return grouped_signals
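
# Worked example (hypothetical values): if frequency_to_milliseconds(f) == 2,
# the raw value list [1, 1, 1, 0, 0] becomes
# [Signal(length=6, value=1), Signal(length=4, value=0)].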
Example #11
0
 def __init__(self,
              env,
              tuning_parameters,
              replicated_device=None,
              thread_id=0):
     ValueOptimizationAgent.__init__(self, env, tuning_parameters,
                                     replicated_device, thread_id)
     self.l_values = Signal("L")
     self.a_values = Signal("Advantage")
     self.mu_values = Signal("Action")
     self.v_values = Signal("V")
     self.signals += [
         self.l_values, self.a_values, self.mu_values, self.v_values
     ]
Example #12
0
class ValueOptimizationAgent(Agent):
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 thread_id=0,
                 create_target_network=True):
        Agent.__init__(self, env, tuning_parameters, replicated_device,
                       thread_id)
        self.main_network = NetworkWrapper(tuning_parameters,
                                           create_target_network,
                                           self.has_global, 'main',
                                           self.replicated_device,
                                           self.worker_device)
        self.networks.append(self.main_network)
        self.q_values = Signal("Q")
        self.signals.append(self.q_values)

        self.reset_game(do_not_reset_env=True)

    # Algorithms for which q_values are calculated from predictions will override this function
    def get_q_values(self, prediction):
        return prediction

    def get_prediction(self, curr_state):
        return self.main_network.online_network.predict(
            self.tf_input_state(curr_state))

    def _validate_action(self, policy, action):
        if np.array(action).shape != ():
            raise ValueError(
                ('The exploration_policy {} returned a vector of actions '
                 'instead of a single action. ValueOptimizationAgents '
                 'require exploration policies which return a single action.'
                 ).format(policy.__class__.__name__))

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        prediction = self.get_prediction(curr_state)
        actions_q_values = self.get_q_values(prediction)

        # choose action according to the exploration policy and the current phase (evaluating or training the agent)
        if phase == RunPhase.TRAIN:
            exploration_policy = self.exploration_policy
        else:
            exploration_policy = self.evaluation_exploration_policy

        action = exploration_policy.get_action(actions_q_values)
        self._validate_action(exploration_policy, action)

        # this is for bootstrapped dqn
        if type(actions_q_values) == list and len(actions_q_values) > 0:
            actions_q_values = actions_q_values[
                self.exploration_policy.selected_head]
        actions_q_values = actions_q_values.squeeze()

        # store the q values statistics for logging
        self.q_values.add_sample(actions_q_values)

        # store information for plotting interactively (actual plotting is done in agent)
        if self.tp.visualization.plot_action_values_online:
            for idx, action_name in enumerate(self.env.actions_description):
                self.episode_running_info[action_name].append(
                    actions_q_values[idx])

        action_value = {"action_value": actions_q_values[action]}
        return action, action_value
Example #13
0
 def __init__(self, path):
     self.path = path
     self.open_file()
     self.bind_events()
     self.on_more_content = Signal()
Example #14
0
input_all = data[:, 1] * 5.86 * 0.01  # v_in: volts
freq_all = data[:, 2]
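# NOTE: output_all is referenced below but is not defined in this excerpt; it
# is assumed to come from another column of `data` earlier in the script.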
t_all = np.array([i * 0.002 for i, _ in enumerate(output_all)])

output = []
input = []
t = []
signal = []

for f in np.unique(freq_all):
    idxs = np.argwhere(freq_all == f)
    output.append(output_all[idxs].flatten())
    input.append(input_all[idxs].flatten())
    t.append(np.array([i * 0.01 for i, _ in enumerate(output[-1])]))
    try:
        signal.append(Signal(t[-1], input[-1], output[-1]))
    except Exception:
        print(f)
        break

fig = plt.figure(figsize=(9, 5))
ax1 = fig.add_subplot(111)
ax1.plot(t_all, output_all, 'b')
ax1.plot(t_all, input_all, 'r')
ax1.grid()

fig2 = plt.figure(figsize=(9, 5))
ax2 = fig2.add_subplot(211)
ax3 = fig2.add_subplot(212)

w = []
Example #15
0
class NStepQAgent(ValueOptimizationAgent, PolicyOptimizationAgent):
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 thread_id=0):
        ValueOptimizationAgent.__init__(self,
                                        env,
                                        tuning_parameters,
                                        replicated_device,
                                        thread_id,
                                        create_target_network=True)
        self.last_gradient_update_step_idx = 0
        self.q_values = Signal('Q Values')
        self.unclipped_grads = Signal('Grads (unclipped)')
        self.value_loss = Signal('Value Loss')
        self.signals.append(self.q_values)
        self.signals.append(self.unclipped_grads)
        self.signals.append(self.value_loss)

    def learn_from_batch(self, batch):
        # batch contains a list of episodes to learn from
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(
            batch)

        # get the values for the current states
        state_value_head_targets = self.main_network.online_network.predict(
            current_states)

        # the targets for the state value estimator
        num_transitions = len(game_overs)

        if self.tp.agent.targets_horizon == '1-Step':
            # 1-Step Q learning
            q_st_plus_1 = self.main_network.target_network.predict(next_states)

            for i in reversed(range(num_transitions)):
                state_value_head_targets[i][actions[i]] = \
                    rewards[i] + (1.0 - game_overs[i]) * self.tp.agent.discount * np.max(q_st_plus_1[i], 0)

        elif self.tp.agent.targets_horizon == 'N-Step':
            # N-Step Q learning
            if game_overs[-1]:
                R = 0
            else:
                R = np.max(
                    self.main_network.target_network.predict(
                        last_sample(next_states)))

            for i in reversed(range(num_transitions)):
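                # accumulate the discounted N-step return backwards; e.g. with
                # rewards [1, 1, 1], discount 0.9 and a terminal last
                # transition, the first transition gets
                # R = 1 + 0.9 * (1 + 0.9 * 1) = 2.71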
                R = rewards[i] + self.tp.agent.discount * R
                state_value_head_targets[i][actions[i]] = R

        else:
            raise ValueError(
                'The available values for targets_horizon are: 1-Step, N-Step')

        # train
        result = self.main_network.online_network.accumulate_gradients(
            current_states, [state_value_head_targets])

        # logging
        total_loss, losses, unclipped_grads = result[:3]
        self.unclipped_grads.add_sample(unclipped_grads)
        self.value_loss.add_sample(losses[0])

        return total_loss

    def train(self):
        # update the target network of every network that has a target network
        if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
            for network in self.networks:
                network.update_target_network(
                    self.tp.agent.rate_for_copying_weights_to_target)
            logger.create_signal_value('Update Target Network', 1)
        else:
            logger.create_signal_value('Update Target Network',
                                       0,
                                       overwrite=False)

        return PolicyOptimizationAgent.train(self)
Example #16
0
class ParamManager(object):
    def __init__(self):
        self.parameters = {}
        self.missing_ids = set()
        self._event = threading.Event()
        self.sig_changed = Signal()
        CommManager().register_model(self)

    @property
    def changed(self):
        return [p for p in self.parameters.values() if p.is_changed]

    def clear(self):
        self.parameters.clear()
        self.sig_changed.emit()

    def update_param(self, param_id, param_index, param_count, value):
        if len(self.missing_ids) == 0:
            self.missing_ids.update(range(param_count))

        p = self.parameters.get(param_id)
        if p:
            p._value = value
            p._changed = False
            log.debug("Update: %s: %s", p.param_id, p.value)
        else:
            self.parameters[param_id] = Parameter(param_id, param_index, value)
            log.debug("Add: %s: %s", param_id, value)

        self.missing_ids.discard(param_index)
        if len(self.missing_ids) == 0:
            log.debug("Retrive done")
            self._event.set()

    def retrieve_all(self):
        self.missing_ids = set()
        self._event.clear()

        # request all
        CommManager().param_request()
        self._event.wait(10.0)

        # not necessary: try to request missing params
        if len(self.missing_ids) > 0:
            log.warn("Missing %d parameters, trying to request.", len(self.missing_ids))
            self._event.clear()
            for idx in self.missing_ids:
                CommManager().param_request(param_index=idx)

            self._event.wait(10.0)

        if len(self.missing_ids):
            log.error("Missing %d parameters", len(self.missing_ids))

        self.sig_changed.emit()
        return len(self.missing_ids) == 0

    def sync(self):
        to_sync = self.changed
        if len(to_sync) == 0:
            log.info("Nothing to sync")
            self.sig_changed.emit()
            return True

        self.missing_ids = set((p.param_index for p in to_sync))
        self._event.clear()
        for p in to_sync:
            CommManager().param_set(p.param_id, p.value)

        self._event.wait(10.0)
        if len(self.missing_ids):
            log.error("Not synced %d parameters", len(self.missing_ids))

        self.sig_changed.emit()
        return len(self.missing_ids) == 0
Example #17
0
 def __init__(self, path: str):
     self._path = path
     self._signal = Signal()
     self._db = self._load_db()
Example #18
0
class Agent(object):
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 task_id=0):
        """
        :param env: An environment instance
        :type env: EnvironmentWrapper
        :param tuning_parameters: A Preset class instance with all the running parameters
        :type tuning_parameters: Preset
        :param replicated_device: A tensorflow device for distributed training (optional)
        :type replicated_device: instancemethod
        :param task_id: The current task id
        :type task_id: int
        """

        screen.log_title("Creating agent {}".format(task_id))
        self.task_id = task_id
        self.sess = tuning_parameters.sess
        self.env = tuning_parameters.env_instance = env
        self.imitation = False

        # i/o dimensions
        if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
            tuning_parameters.env.desired_observation_width = self.env.width
            tuning_parameters.env.desired_observation_height = self.env.height
        self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
        self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
        if tuning_parameters.agent.use_accumulated_reward_as_measurement:
            self.measurements_size = tuning_parameters.env.measurements_size = (
                self.measurements_size[0] + 1, )

        # modules
        if tuning_parameters.agent.load_memory_from_file_path:
            screen.log_title(
                "Loading replay buffer from pickle. Pickle path: {}".format(
                    tuning_parameters.agent.load_memory_from_file_path))
            self.memory = read_pickle(
                tuning_parameters.agent.load_memory_from_file_path)
        else:
            self.memory = eval(tuning_parameters.memory +
                               '(tuning_parameters)')
        # self.architecture = eval(tuning_parameters.architecture)

        self.has_global = replicated_device is not None
        self.replicated_device = replicated_device
        self.worker_device = "/job:worker/task:{}/cpu:0".format(
            task_id) if replicated_device is not None else "/gpu:0"

        self.exploration_policy = eval(tuning_parameters.exploration.policy +
                                       '(tuning_parameters)')
        self.evaluation_exploration_policy = eval(
            tuning_parameters.exploration.evaluation_policy +
            '(tuning_parameters)')
        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

        # initialize all internal variables
        self.tp = tuning_parameters
        self.in_heatup = False
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.current_episode = self.tp.current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        logger.set_current_time(self.current_episode)
        self.main_network = None
        self.networks = []
        self.last_episode_images = []
        self.renderer = Renderer()

        # signals
        self.signals = []
        self.loss = Signal('Loss')
        self.signals.append(self.loss)
        self.curr_learning_rate = Signal('Learning Rate')
        self.signals.append(self.curr_learning_rate)

        if self.tp.env.normalize_observation and not self.env.is_state_type_image:
            if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                self.running_observation_stats = RunningStat(
                    (self.tp.env.desired_observation_width, ))
                self.running_reward_stats = RunningStat(())
            else:
                self.running_observation_stats = SharedRunningStats(
                    self.tp,
                    replicated_device,
                    shape=(self.tp.env.desired_observation_width, ),
                    name='observation_stats')
                self.running_reward_stats = SharedRunningStats(
                    self.tp, replicated_device, shape=(), name='reward_stats')

        # env is already reset at this point. Otherwise we're getting an error where you cannot
        # reset an env which is not done
        self.reset_game(do_not_reset_env=True)

        # use seed
        if self.tp.seed is not None:
            random.seed(self.tp.seed)
            np.random.seed(self.tp.seed)

    def log_to_screen(self, phase):
        # log to screen
        if self.current_episode >= 0:
            if phase == RunPhase.TRAIN:
                exploration = self.exploration_policy.get_control_param()
            else:
                exploration = self.evaluation_exploration_policy.get_control_param(
                )

            screen.log_dict(OrderedDict([
                ("Worker", self.task_id), ("Episode", self.current_episode),
                ("total reward", self.total_reward_in_current_episode),
                ("exploration", exploration),
                ("steps", self.total_steps_counter),
                ("training iteration", self.training_iteration)
            ]),
                            prefix=phase)

    def update_log(self, phase=RunPhase.TRAIN):
        """
        Writes logging messages to screen and updates the log file with all the signal values.
        :return: None
        """
        # log all the signals to file
        logger.set_current_time(self.current_episode)
        logger.create_signal_value('Training Iter', self.training_iteration)
        logger.create_signal_value('In Heatup', int(phase == RunPhase.HEATUP))
        logger.create_signal_value('ER #Transitions',
                                   self.memory.num_transitions())
        logger.create_signal_value('ER #Episodes', self.memory.length())
        logger.create_signal_value('Episode Length',
                                   self.current_episode_steps_counter)
        logger.create_signal_value('Total steps', self.total_steps_counter)
        logger.create_signal_value("Epsilon",
                                   self.exploration_policy.get_control_param())
        logger.create_signal_value(
            "Training Reward", self.total_reward_in_current_episode
            if phase == RunPhase.TRAIN else np.nan)
        logger.create_signal_value(
            'Evaluation Reward', self.total_reward_in_current_episode
            if phase == RunPhase.TEST else np.nan)
        logger.create_signal_value('Update Target Network', 0, overwrite=False)
        logger.update_wall_clock_time(self.current_episode)

        for signal in self.signals:
            logger.create_signal_value("{}/Mean".format(signal.name),
                                       signal.get_mean())
            logger.create_signal_value("{}/Stdev".format(signal.name),
                                       signal.get_stdev())
            logger.create_signal_value("{}/Max".format(signal.name),
                                       signal.get_max())
            logger.create_signal_value("{}/Min".format(signal.name),
                                       signal.get_min())

        # dump
        if self.current_episode % self.tp.visualization.dump_signals_to_csv_every_x_episodes == 0 \
                and self.current_episode > 0:
            logger.dump_output_csv()

    def reset_game(self, do_not_reset_env=False):
        """
        Resets all the episodic parameters and starts a new environment episode.
        :param do_not_reset_env: A boolean that allows prevention of environment reset
        :return: None
        """

        for signal in self.signals:
            signal.reset()
        self.total_reward_in_current_episode = 0
        self.curr_state = {}
        self.last_episode_images = []
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        if not do_not_reset_env:
            self.env.reset()
        self.exploration_policy.reset()

        # required for online plotting
        if self.tp.visualization.plot_action_values_online:
            if hasattr(self, 'episode_running_info') and hasattr(
                    self.env, 'actions_description'):
                for action in self.env.actions_description:
                    self.episode_running_info[action] = []
            plt.clf()

        if self.tp.agent.middleware_type == MiddlewareTypes.LSTM:
            for network in self.networks:
                network.online_network.curr_rnn_c_in = network.online_network.middleware_embedder.c_init
                network.online_network.curr_rnn_h_in = network.online_network.middleware_embedder.h_init

    def preprocess_observation(self, observation):
        """
        Preprocesses the given observation.
        For images - convert to grayscale, resize and convert to int.
        For measurements vectors - normalize by a running average and std.
        :param observation: The agent's observation
        :return: A processed version of the observation
        """

        if self.env.is_state_type_image:
            # rescale
            observation = scipy.misc.imresize(
                observation, (self.tp.env.desired_observation_height,
                              self.tp.env.desired_observation_width),
                interp=self.tp.rescaling_interpolation_type)
            # rgb to y
            if len(observation.shape) > 2 and observation.shape[2] > 1:
                r, g, b = observation[:, :,
                                      0], observation[:, :,
                                                      1], observation[:, :, 2]
                observation = 0.2989 * r + 0.5870 * g + 0.1140 * b

            # Render the processed observation which is how the agent will see it
            # Warning: this cannot currently be done in parallel to rendering the environment
            if self.tp.visualization.render_observation:
                if not self.renderer.is_open:
                    self.renderer.create_screen(observation.shape[0],
                                                observation.shape[1])
                self.renderer.render_image(observation)

            return observation.astype('uint8')
        else:
            if self.tp.env.normalize_observation:
                # standardize the input observation using a running mean and std
                if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                    self.running_observation_stats.push(observation)
                observation = (observation - self.running_observation_stats.mean) / \
                              (self.running_observation_stats.std + 1e-15)
                observation = np.clip(observation, -5.0, 5.0)
            return observation

    def learn_from_batch(self, batch):
        """
        Given a batch of transitions, calculates their target values and updates the network.
        :param batch: A list of transitions
        :return: The loss of the training
        """
        pass

    def train(self):
        """
        A single training iteration. Sample a batch, train on it and update target networks.
        :return: The training loss.
        """
        batch = self.memory.sample(self.tp.batch_size)
        loss = self.learn_from_batch(batch)

        if self.tp.learning_rate_decay_rate != 0:
            self.curr_learning_rate.add_sample(
                self.tp.sess.run(self.tp.learning_rate))
        else:
            self.curr_learning_rate.add_sample(self.tp.learning_rate)

        # update the target network of every network that has a target network
        if self.total_steps_counter % self.tp.agent.num_steps_between_copying_online_weights_to_target == 0:
            for network in self.networks:
                network.update_target_network(
                    self.tp.agent.rate_for_copying_weights_to_target)
            logger.create_signal_value('Update Target Network', 1)
        else:
            logger.create_signal_value('Update Target Network',
                                       0,
                                       overwrite=False)

        return loss

    def extract_batch(self, batch):
        """
        Extracts a single numpy array for each object in a batch of transitions (state, action, etc.)
        :param batch: An array of transitions
        :return: For each transition element, returns a numpy array of all the transitions in the batch
        """
        current_states = {}
        next_states = {}

        current_states['observation'] = np.array(
            [transition.state['observation'] for transition in batch])
        next_states['observation'] = np.array(
            [transition.next_state['observation'] for transition in batch])
        actions = np.array([transition.action for transition in batch])
        rewards = np.array([transition.reward for transition in batch])
        game_overs = np.array([transition.game_over for transition in batch])
        total_return = np.array(
            [transition.total_return for transition in batch])

        # get the entire state including measurements if available
        if self.tp.agent.use_measurements:
            current_states['measurements'] = np.array(
                [transition.state['measurements'] for transition in batch])
            next_states['measurements'] = np.array([
                transition.next_state['measurements'] for transition in batch
            ])

        return current_states, next_states, actions, rewards, game_overs, total_return

    def plot_action_values_online(self):
        """
        Plot an animated graph of the value of each possible action during the episode
        :return: None
        """

        plt.clf()
        for key, data_list in self.episode_running_info.items():
            plt.plot(data_list, label=key)
        plt.legend()
        plt.pause(0.00000001)

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        """
        choose an action to act with in the current episode being played. Different behavior might be exhibited when training
         or testing.

        :param curr_state: the current state to act upon.
        :param phase: the current phase: training or testing.
        :return: chosen action, some action value describing the action (q-value, probability, etc)
        """
        pass

    def preprocess_reward(self, reward):
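        # scale the reward first, then clip it to the configured min/max when set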
        if self.tp.env.reward_scaling:
            reward /= float(self.tp.env.reward_scaling)
        if self.tp.env.reward_clipping_max:
            reward = min(reward, self.tp.env.reward_clipping_max)
        if self.tp.env.reward_clipping_min:
            reward = max(reward, self.tp.env.reward_clipping_min)
        return reward

    def tf_input_state(self, curr_state):
        """
        convert curr_state into input tensors tensorflow is expecting.
        """
        # add batch axis with length 1 onto each value
        # extract values from the state based on agent.input_types
        input_state = {}
        for input_name in self.tp.agent.input_types.keys():
            input_state[input_name] = np.expand_dims(
                np.array(curr_state[input_name]), 0)
        return input_state

    def act(self, phase=RunPhase.TRAIN):
        """
        Take one step in the environment according to the network prediction and store the transition in memory
        :param phase: Either Train or Test to specify if greedy actions should be used and if transitions should be stored
        :return: A boolean value that signals an episode termination
        """

        self.total_steps_counter += 1
        self.current_episode_steps_counter += 1

        # get new action
        action_info = {
            "action_probability": 1.0 / self.env.action_space_size,
            "action_value": 0
        }
        is_first_transition_in_episode = (self.curr_state == {})
        if is_first_transition_in_episode:
            if not isinstance(self.env.state, dict):
                raise ValueError(
                    ('expected state to be a dictionary, found {}').format(
                        type(self.env.state)))

            state = self.env.state
            # TODO: modify preprocess_observation to modify the entire state
            # for now, only preprocess the observation
            state['observation'] = self.preprocess_observation(
                state['observation'])

            # TODO: provide option to stack more than just the observation
            # TODO: this should probably be happening in an environment wrapper anyway
            state['observation'] = stack_observation(
                [], state['observation'], self.tp.env.observation_stack_size)

            self.curr_state = state
            if self.tp.agent.use_measurements:
                # TODO: this should be handled in the environment
                self.curr_state['measurements'] = self.env.measurements

                if self.tp.agent.use_accumulated_reward_as_measurement:
                    self.curr_state['measurements'] = np.append(
                        self.curr_state['measurements'], 0)

        if phase == RunPhase.HEATUP and not self.tp.heatup_using_network_decisions:
            action = self.env.get_random_action()
        else:
            action, action_info = self.choose_action(self.curr_state,
                                                     phase=phase)

        # perform action
        if type(action) == np.ndarray:
            action = action.squeeze()
        result = self.env.step(action)

        shaped_reward = self.preprocess_reward(result['reward'])
        if 'action_intrinsic_reward' in action_info.keys():
            shaped_reward += action_info['action_intrinsic_reward']
        # TODO: should total_reward_in_current_episode include shaped_reward?
        self.total_reward_in_current_episode += result['reward']
        next_state = result['state']
        next_state['observation'] = self.preprocess_observation(
            next_state['observation'])

        # plot action values online
        if self.tp.visualization.plot_action_values_online and phase != RunPhase.HEATUP:
            self.plot_action_values_online()

        # initialize the next state
        # TODO: provide option to stack more than just the observation
        next_state['observation'] = stack_observation(
            self.curr_state['observation'], next_state['observation'],
            self.tp.env.observation_stack_size)

        if self.tp.agent.use_measurements and 'measurements' in result.keys():
            next_state['measurements'] = result['state']['measurements']
            if self.tp.agent.use_accumulated_reward_as_measurement:
                next_state['measurements'] = np.append(
                    next_state['measurements'],
                    self.total_reward_in_current_episode)

        # store the transition only if we are training
        if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
            transition = Transition(self.curr_state, result['action'],
                                    shaped_reward, next_state, result['done'])
            for key in action_info.keys():
                transition.info[key] = action_info[key]
            if self.tp.agent.add_a_normalized_timestep_to_the_observation:
                transition.info['timestep'] = float(
                    self.current_episode_steps_counter
                ) / self.env.timestep_limit
            self.memory.store(transition)
        elif phase == RunPhase.TEST and self.tp.visualization.dump_gifs:
            # we store the transitions only for saving gifs
            self.last_episode_images.append(self.env.get_rendered_image())

        # update the current state for the next step
        self.curr_state = next_state

        # deal with episode termination
        if result['done']:
            if self.tp.visualization.dump_csv:
                self.update_log(phase=phase)
            self.log_to_screen(phase=phase)

            if phase == RunPhase.TRAIN or phase == RunPhase.HEATUP:
                self.reset_game()

            self.current_episode += 1
            self.tp.current_episode = self.current_episode

        # return episode really ended
        return result['done']

    def evaluate(self, num_episodes, keep_networks_synced=False):
        """
        Run in an evaluation mode for several episodes. Actions will be chosen greedily.
        :param keep_networks_synced: keep the online network in sync with the global network after every episode
        :param num_episodes: The number of episodes to evaluate on
        :return: None
        """

        max_reward_achieved = -float('inf')
        average_evaluation_reward = 0
        screen.log_title("Running evaluation")
        self.env.change_phase(RunPhase.TEST)
        for i in range(num_episodes):
            # keep the online network in sync with the global network
            if keep_networks_synced:
                for network in self.networks:
                    network.sync()

            episode_ended = False
            while not episode_ended:
                episode_ended = self.act(phase=RunPhase.TEST)

                if keep_networks_synced \
                   and self.total_steps_counter % self.tp.agent.update_evaluation_agent_network_after_every_num_steps:
                    for network in self.networks:
                        network.sync()

            if self.total_reward_in_current_episode > max_reward_achieved:
                max_reward_achieved = self.total_reward_in_current_episode
                frame_skipping = int(5 / self.tp.env.frame_skip)
                if self.tp.visualization.dump_gifs:
                    logger.create_gif(
                        self.last_episode_images[::frame_skipping],
                        name='score-{}'.format(max_reward_achieved),
                        fps=10)

            average_evaluation_reward += self.total_reward_in_current_episode
            self.reset_game()

        average_evaluation_reward /= float(num_episodes)

        self.env.change_phase(RunPhase.TRAIN)
        screen.log_title("Evaluation done. Average reward = {}.".format(
            average_evaluation_reward))

    def post_training_commands(self):
        pass

    def improve(self):
        """
        Training algorithms wrapper. Heatup >> [ Evaluate >> Play >> Train >> Save checkpoint ]

        :return: None
        """

        # synchronize the online network weights with the global network
        for network in self.networks:
            network.sync()

        # heatup phase
        if self.tp.num_heatup_steps != 0:
            self.in_heatup = True
            screen.log_title("Starting heatup {}".format(self.task_id))
            num_steps_required_for_one_training_batch = self.tp.batch_size * self.tp.env.observation_stack_size
            for step in range(
                    max(self.tp.num_heatup_steps,
                        num_steps_required_for_one_training_batch)):
                self.act(phase=RunPhase.HEATUP)

        # training phase
        self.in_heatup = False
        screen.log_title("Starting training {}".format(self.task_id))
        self.exploration_policy.change_phase(RunPhase.TRAIN)
        training_start_time = time.time()
        model_snapshots_periods_passed = -1

        while self.training_iteration < self.tp.num_training_iterations:
            # evaluate
            evaluate_agent = (self.last_episode_evaluation_ran != self.current_episode) and \
                             (self.current_episode % self.tp.evaluate_every_x_episodes == 0)
            evaluate_agent = evaluate_agent or \
                             (self.imitation and self.training_iteration > 0 and
                              self.training_iteration % self.tp.evaluate_every_x_training_iterations == 0)

            if evaluate_agent:
                self.env.reset()
                self.last_episode_evaluation_ran = self.current_episode
                self.evaluate(self.tp.evaluation_episodes)

            # snapshot model
            if self.tp.save_model_sec and self.tp.save_model_sec > 0 and not self.tp.distributed:
                total_training_time = time.time() - training_start_time
                current_snapshot_period = (int(total_training_time) //
                                           self.tp.save_model_sec)
                if current_snapshot_period > model_snapshots_periods_passed:
                    model_snapshots_periods_passed = current_snapshot_period
                    self.save_model(model_snapshots_periods_passed)

            # play and record in replay buffer
            if self.tp.agent.collect_new_data:
                if self.tp.agent.step_until_collecting_full_episodes:
                    step = 0
                    while step < self.tp.agent.num_consecutive_playing_steps or self.memory.get_episode(
                            -1).length() != 0:
                        self.act()
                        step += 1
                else:
                    for step in range(
                            self.tp.agent.num_consecutive_playing_steps):
                        self.act()

            # train
            if self.tp.train:
                for step in range(
                        self.tp.agent.num_consecutive_training_steps):
                    loss = self.train()
                    self.loss.add_sample(loss)
                    self.training_iteration += 1
                    if self.imitation:
                        self.log_to_screen(RunPhase.TRAIN)
                self.post_training_commands()

    def save_model(self, model_id):
        self.main_network.save_model(model_id)
Example #19
0
class ParamManager(object):
    def __init__(self):
        self.parameters = {}
        self.missing_ids = set()
        self._event = threading.Event()
        self.sig_changed = Signal()
        CommManager().register_model(self)

    @property
    def changed(self):
        return [p for p in self.parameters.values() if p.is_changed]

    def clear(self):
        self.parameters.clear()
        self.sig_changed.emit()

    def update_param(self, param_id, param_index, param_count, value):
        if len(self.missing_ids) == 0:
            self.missing_ids.update(range(param_count))

        p = self.parameters.get(param_id)
        if p:
            p._value = value
            p._changed = False
            log.debug("Update: %s: %s", p.param_id, p.value)
        else:
            self.parameters[param_id] = Parameter(param_id, param_index, value)
            log.debug("Add: %s: %s", param_id, value)

        self.missing_ids.discard(param_index)
        if len(self.missing_ids) == 0:
            log.debug("Retrive done")
            self._event.set()

    def retrieve_all(self):
        self.missing_ids = set()
        self._event.clear()

        # request all
        CommManager().param_request()
        self._event.wait(10.0)

        # not necessary: try to request missing params
        if len(self.missing_ids) > 0:
            log.warn("Missing %d parameters, trying to request.",
                     len(self.missing_ids))
            self._event.clear()
            for idx in self.missing_ids:
                CommManager().param_request(param_index=idx)

            self._event.wait(10.0)

        if len(self.missing_ids):
            log.error("Missing %d parameters", len(self.missing_ids))

        self.sig_changed.emit()
        return len(self.missing_ids) == 0

    def sync(self):
        to_sync = self.changed
        if len(to_sync) == 0:
            log.info("Nothing to sync")
            self.sig_changed.emit()
            return True

        self.missing_ids = set((p.param_index for p in to_sync))
        self._event.clear()
        for p in to_sync:
            CommManager().param_set(p.param_id, p.value)

        self._event.wait(10.0)
        if len(self.missing_ids):
            log.error("Not synced %d parameters", len(self.missing_ids))

        self.sig_changed.emit()
        return len(self.missing_ids) == 0
Example #20
0
class NAFAgent(ValueOptimizationAgent):
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 thread_id=0):
        ValueOptimizationAgent.__init__(self, env, tuning_parameters,
                                        replicated_device, thread_id)
        self.l_values = Signal("L")
        self.a_values = Signal("Advantage")
        self.mu_values = Signal("Action")
        self.v_values = Signal("V")
        self.signals += [
            self.l_values, self.a_values, self.mu_values, self.v_values
        ]

    def learn_from_batch(self, batch):
        current_states, next_states, actions, rewards, game_overs, _ = self.extract_batch(
            batch)

        # TD error = r + discount*v_st_plus_1 - q_st
        v_st_plus_1 = self.main_network.target_network.predict(
            next_states,
            self.main_network.target_network.output_heads[0].V,
            squeeze_output=False,
        )
        TD_targets = np.expand_dims(rewards, -1) + (1.0 - np.expand_dims(
            game_overs, -1)) * self.tp.agent.discount * v_st_plus_1

        if len(actions.shape) == 1:
            actions = np.expand_dims(actions, -1)

        result = self.main_network.train_and_sync_networks(
            {
                **current_states, 'output_0_0': actions
            }, TD_targets)
        total_loss = result[0]

        return total_loss

    def choose_action(self, curr_state, phase=RunPhase.TRAIN):
        assert not self.env.discrete_controls, 'NAF works only for continuous control problems'

        # convert to batch so we can run it through the network
        # observation = np.expand_dims(np.array(curr_state['observation']), 0)
        naf_head = self.main_network.online_network.output_heads[0]
        action_values = self.main_network.online_network.predict(
            self.tf_input_state(curr_state),
            outputs=naf_head.mu,
            squeeze_output=False,
        )
        if phase == RunPhase.TRAIN:
            action = self.exploration_policy.get_action(action_values)
        else:
            action = action_values

        Q, L, A, mu, V = self.main_network.online_network.predict(
            {
                **self.tf_input_state(curr_state), 'output_0_0': action_values
            },
            outputs=[
                naf_head.Q, naf_head.L, naf_head.A, naf_head.mu, naf_head.V
            ],
        )

        # store the q values statistics for logging
        self.q_values.add_sample(Q)
        self.l_values.add_sample(L)
        self.a_values.add_sample(A)
        self.mu_values.add_sample(mu)
        self.v_values.add_sample(V)

        action_value = {"action_value": Q}
        return action, action_value
Example #21
0
 def __init__(self):
     self.messages = []
     self.sig_changed = Signal()
     CommManager().register_model(self)
Example #22
0
 def __init__(self):
     self.last_message = None
     self.sig_changed = Signal()
Example #23
0
 def __init__(self):
     self.parameters = {}
     self.missing_ids = set()
     self._event = threading.Event()
     self.sig_changed = Signal()
     CommManager().register_model(self)
Example #24
0
class RaceManager:
    
    testSpeedRatio = 1
    
    def __init__(self):
        self.fleets = []
        self.fleetsById = {}
        self.changed = Signal()
        self.finishes = []
        self.finishesById = {}
        # we store these on the race manager so that they get pickled
        self.nextFleetId = 1
        self.nextFinishId = 1
        
    #
    # this method controls how the RaceManager is pickled. We want to avoid pickling the Signal object
    # stored on the changed attribute
    #
    def __getstate__(self):
        attributes = self.__dict__.copy()
        del attributes["changed"]
        
        return attributes
    
    #
    # this method controls how the RaceManager is unpickled. We need to set the changed attribute
    # as it is not part of the pickle
    #
    def __setstate__(self,d):
        self.__dict__ = d
        self.changed = Signal()
         

    def incrementNextFleetId(self):
        self.nextFleetId = self.nextFleetId + 1

    def incrementNextFinishId(self):
        self.nextFinishId = self.nextFinishId + 1

    def adjustedSeconds(self, unadjustedSeconds):
        return unadjustedSeconds * RaceManager.testSpeedRatio
    
    def unadjustedSecond(self,adjustedSeconds):
        return adjustedSeconds / RaceManager.testSpeedRatio
    
    #
    # Create a fleet, add to our fleets and return the fleet. If the name is not specified,
    # we create a name as 'Fleet N' where N is the number of fleets.
    #
    def createFleet(self, name=None):
        aFleet = Fleet(name=name,fleetId=self.nextFleetId)
        self.incrementNextFleetId()
        self.addFleet(aFleet)
        return aFleet
    
    def fleetWithId(self,fleetId):
        if fleetId in self.fleetsById:
            return self.fleetsById[fleetId]
        

    def addFleet(self, aFleet):
        self.fleets.append(aFleet)
        self.fleetsById[aFleet.fleetId] = aFleet
        self.changed.fire("fleetAdded",aFleet)
        

    def removeFleet(self, aFleet):
        if aFleet in self.fleets:
            positionInList = self.fleets.index(aFleet)
            self.fleets.remove(aFleet)
            del self.fleetsById[aFleet.fleetId]
            self.changed.fire("fleetRemoved",aFleet)
            
        else:
            raise RaceException("Fleet not found",aFleet)
            

    def numberFleets(self):
        return len(self.fleets)
    
    def hasFleets(self):
        return self.numberFleets() > 0

    #
    # Start our race sequence in ten seconds with a five minute warning before the first
    # fleet, i.e. 10 minutes to the first fleet start. This is F flag start
    #
    def startRaceSequenceWithWarning(self):
        logging.info("Start sequence with warning (F flag start)")
        fleetNumber = 0
        
        now = datetime.now()
        sequenceStart = now + timedelta(seconds=10)
        for fleet in self.fleets:
            fleetNumber = fleetNumber + 1
            
            startTime = sequenceStart + timedelta(
                seconds = (WARNING_SECONDS/RaceManager.testSpeedRatio + 
                        (START_SECONDS * fleetNumber)/RaceManager.testSpeedRatio))

            self.updateFleetStartTime(fleet,startTime)
        self.changed.fire("sequenceStartedWithWarning")


    #
    # Start our race sequence without a warning (i.e. class start)
    #
    def startRaceSequenceWithoutWarning(self):
        logging.info("Start sequence without warning (class flag start)")
        fleetNumber = 0
        now = datetime.now()
        
        sequenceStart = now + timedelta(seconds=10)
        
        for fleet in self.fleets:
            fleetNumber = fleetNumber + 1
            
            startTime = sequenceStart + timedelta(
                seconds = (START_SECONDS * fleetNumber)/RaceManager.testSpeedRatio)

            self.updateFleetStartTime(fleet,startTime)
        self.changed.fire("sequenceStartedWithoutWarning")
    #
    # Update the startTime for a fleet. Do this through the race manager
    # so that the race manager can signal the event change
    #
    def updateFleetStartTime(self, aFleet, startTime):
        aFleet.startTime = startTime
        # signal that the fleet start time has changed
        self.changed.fire("fleetChanged",aFleet)
        
            

    #
    # Find the last fleet started. This is a reverse search
    # of the fleets list for a started fleet.
    # Returns None if not found
    #
    def lastFleetStarted(self):
        for fleet in reversed(self.fleets):
            if fleet.isStarted():
                return fleet
        return None
    
    #
    # Find the next fleet to start. If we don't have a fleet starting,
    # return None.
    #
    def nextFleetToStart(self):
        for fleet in self.fleets:
            if fleet.isStarting() or fleet.isWaitingToStart():
                return fleet
        return None


    def hasStartedFleet(self):
        return self.lastFleetStarted() is not None
    
    
    def hasSequenceStarted(self):
        return self.nextFleetToStart() is not None
    
    
    #
    # Reset start sequence - set all fleets to no start time, and fire a signal
    #
    def resetStartSequence(self):
        for fleet in self.fleets:
            fleet.startTime = None
        self.removeAllFinishes()
        self.changed.fire("startSequenceReset")

    def lastFleet(self):
        return self.fleets[-1]

    #
    # Perform a general recall. This is always for the fleet that
    # has most recently started
    #
    def generalRecall(self):
        logging.info("General recall")
        fleetToRecall = self.lastFleetStarted()

        # if this is the last (or only) fleet, set its start time to be six
        # minutes from now
        if fleetToRecall == self.fleets[-1]:
            logging.info("General recall last fleet")
            self.updateFleetStartTime(fleetToRecall,datetime.now()
                                 + timedelta(seconds=(START_SECONDS+LAST_START_GENERAL_RECALL_DELAY)/RaceManager.testSpeedRatio))

        # otherwise kick the fleet to be the back of the queue,
        # with a start time five minutes after the last fleet
        else:
            
            self.removeFleet(fleetToRecall)
            lastFleet = self.fleets[-1]
            self.updateFleetStartTime(fleetToRecall,
                    lastFleet.startTime + timedelta(seconds=START_SECONDS/RaceManager.testSpeedRatio))
            self.addFleet(fleetToRecall)
            logging.log(logging.INFO, "General recall not last fleet. Moving to back of queue. Delta to start time now %d seconds",
                        fleetToRecall.adjustedDeltaSecondsToStartTime())
            
        self.changed.fire("generalRecall", fleetToRecall)

        
    #
    # Create a finish and add it to the race manager's list of finishes. 
    # This method returns a finish object. By default, the finish object will
    # have a finish time of now and no fleet.
    #
    def createFinish(self, fleet=None, finishTime=None):
        
        # if no finish time is supplied, set the finish time to be now
        if not finishTime:
            finishTime = datetime.now()
        # create the finish object
        
        # if we only have one fleet, this will be the fleet for the finish
        if self.numberFleets() == 1:
            fleet = self.fleets[0]
        
        aFinish = Finish(fleet=fleet,finishTime=finishTime,finishId=self.nextFinishId)
        self.incrementNextFinishId()
        
        self.addFinish(aFinish)
        
        return aFinish
        
    
    def addFinish(self,finish):
        # add it to our list of finish objects
        self.finishes.append(finish)
        self.finishesById[finish.finishId] = finish
        # fire a change signal
        self.changed.fire("finishAdded",finish)
        
    def removeFinish(self,finish):
        self.finishes.remove(finish)
        del self.finishesById[finish.finishId]
        self.changed.fire("finishRemoved",finish)
        
    def updateFinish(self,finish):
        self.changed.fire("finishChanged",finish)
        
    def removeAllFinishes(self):
        for finish in list(self.finishes):
            self.removeFinish(finish)
        
    def finishWithId(self,finishId):
        if finishId in self.finishesById:
            return self.finishesById[finishId]
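
# Hypothetical round-trip sketch (not part of the original source): shows that
# the Signal stored on `changed` is dropped by __getstate__ and recreated by
# __setstate__. Assumes the surrounding module's Signal, Fleet and Finish
# classes are importable and picklable as used above.
if __name__ == "__main__":
    import pickle

    manager = RaceManager()
    manager.createFleet(name="Lasers")
    restored = pickle.loads(pickle.dumps(manager))
    assert restored.numberFleets() == 1
    assert isinstance(restored.changed, Signal)  # recreated on unpickle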
Example #25
0
    def __init__(self,
                 env,
                 tuning_parameters,
                 replicated_device=None,
                 task_id=0):
        """
        :param env: An environment instance
        :type env: EnvironmentWrapper
        :param tuning_parameters: A Preset class instance with all the running parameters
        :type tuning_parameters: Preset
        :param replicated_device: A tensorflow device for distributed training (optional)
        :type replicated_device: instancemethod
        :param task_id: The current task id
        :type task_id: int
        """

        screen.log_title("Creating agent {}".format(task_id))
        self.task_id = task_id
        self.sess = tuning_parameters.sess
        self.env = tuning_parameters.env_instance = env
        self.imitation = False

        # i/o dimensions
        if not tuning_parameters.env.desired_observation_width or not tuning_parameters.env.desired_observation_height:
            tuning_parameters.env.desired_observation_width = self.env.width
            tuning_parameters.env.desired_observation_height = self.env.height
        self.action_space_size = tuning_parameters.env.action_space_size = self.env.action_space_size
        self.measurements_size = tuning_parameters.env.measurements_size = self.env.measurements_size
        if tuning_parameters.agent.use_accumulated_reward_as_measurement:
            self.measurements_size = tuning_parameters.env.measurements_size = (
                self.measurements_size[0] + 1, )

        # modules
        if tuning_parameters.agent.load_memory_from_file_path:
            screen.log_title(
                "Loading replay buffer from pickle. Pickle path: {}".format(
                    tuning_parameters.agent.load_memory_from_file_path))
            self.memory = read_pickle(
                tuning_parameters.agent.load_memory_from_file_path)
        else:
            self.memory = eval(tuning_parameters.memory +
                               '(tuning_parameters)')
        # self.architecture = eval(tuning_parameters.architecture)

        self.has_global = replicated_device is not None
        self.replicated_device = replicated_device
        self.worker_device = "/job:worker/task:{}/cpu:0".format(
            task_id) if replicated_device is not None else "/gpu:0"

        self.exploration_policy = eval(tuning_parameters.exploration.policy +
                                       '(tuning_parameters)')
        self.evaluation_exploration_policy = eval(
            tuning_parameters.exploration.evaluation_policy +
            '(tuning_parameters)')
        self.evaluation_exploration_policy.change_phase(RunPhase.TEST)

        # initialize all internal variables
        self.tp = tuning_parameters
        self.in_heatup = False
        self.total_reward_in_current_episode = 0
        self.total_steps_counter = 0
        self.running_reward = None
        self.training_iteration = 0
        self.current_episode = self.tp.current_episode = 0
        self.curr_state = {}
        self.current_episode_steps_counter = 0
        self.episode_running_info = {}
        self.last_episode_evaluation_ran = 0
        self.running_observations = []
        logger.set_current_time(self.current_episode)
        self.main_network = None
        self.networks = []
        self.last_episode_images = []
        self.renderer = Renderer()

        # signals
        self.signals = []
        self.loss = Signal('Loss')
        self.signals.append(self.loss)
        self.curr_learning_rate = Signal('Learning Rate')
        self.signals.append(self.curr_learning_rate)

        if self.tp.env.normalize_observation and not self.env.is_state_type_image:
            if not self.tp.distributed or not self.tp.agent.share_statistics_between_workers:
                self.running_observation_stats = RunningStat(
                    (self.tp.env.desired_observation_width, ))
                self.running_reward_stats = RunningStat(())
            else:
                self.running_observation_stats = SharedRunningStats(
                    self.tp,
                    replicated_device,
                    shape=(self.tp.env.desired_observation_width, ),
                    name='observation_stats')
                self.running_reward_stats = SharedRunningStats(
                    self.tp, replicated_device, shape=(), name='reward_stats')

        # env is already reset at this point. Otherwise we're getting an error where you cannot
        # reset an env which is not done
        self.reset_game(do_not_reset_env=True)

        # use seed
        if self.tp.seed is not None:
            random.seed(self.tp.seed)
            np.random.seed(self.tp.seed)
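
    # A hedged construction sketch (not part of the original example): the agent
    # class name, preset, and environment below are hypothetical stand-ins for a
    # concrete agent subclass and the Preset/EnvironmentWrapper instances
    # described in the docstring above.
    #
    #     agent = MyValueOptimizationAgent(env=env_instance,
    #                                      tuning_parameters=preset,
    #                                      replicated_device=None,
    #                                      task_id=0)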
Example #26
0
    def __setstate__(self, d):
        self.__dict__ = d
        self.changed = Signal()
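
    # A hedged sketch of a matching __getstate__ (not part of the original
    # example): it assumes the Signal holds live callbacks that cannot be
    # pickled, so it is dropped before pickling and re-created in __setstate__.
    def __getstate__(self):
        state = self.__dict__.copy()
        state.pop('changed', None)
        return state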
Example #27
0
    def __init__(self):
        self.parameters = {}
        self.missing_ids = set()
        self._event = threading.Event()
        self.sig_changed = Signal()
        CommManager().register_model(self)
Example #28
0
class Database:
    __slots__ = ['_path', '_signal', '_db']

    Event = namedtuple('Event', ['type', 'feed', 'link'])

    def __init__(self, path: str):
        self._path = path
        self._signal = Signal()
        self._db = self._load_db()

    def __getitem__(self, key):
        return self._db[key]

    def feeds(self):
        return self._db.keys()

    def channels(self):
        return (ch for chs in self._db.values() for ch in chs.values())

    def channels_of_feed(self, feed: str, links: bool = True):
        origin = self._db[feed].keys if links else self._db[feed].values
        return (ch for ch in origin())

    def add_feed(self, link: str):
        self._db[link] = {}
        self.flush()

    def add_channel(self, link: str, last_id: int, feed: str):
        self._db[feed][link] = Channel(link, last_id, feed)
        self.flush()
        self._publish(self.Event('add', feed, link))

    def remove_channel(self, link: str, feed: str):
        del self._db[feed][link]
        self.flush()
        self._publish(self.Event('remove', feed, link))

    def remove_feed(self, feed: str):
        for link in self._db[feed].keys():
            self._publish(self.Event('remove', feed, link))
        del self._db[feed]
        self.flush()

    def feed_nonempty(self, feed: str):
        return bool(self._db[feed].keys())

    def feed_exists(self, link: str):
        return link in self._db.keys()

    def channel_exists(self, link: str, feed: str = None):
        return link in self._db[feed].keys() if feed else any(
            link in chs.keys() for chs in self._db.values())

    def flush(self):
        with open(self._path, 'w') as db:
            json.dump(self._db, db)

    def subscribe(self, receiver: Callable[['Database.Event'], None]):
        self._signal.connect(receiver)

    def _publish(self, event: 'Database.Event'):
        self._signal(event)

    def _load_db(self) -> Dict[str, Dict[str, Channel]]:
        try:
            with open(self._path) as db:
                _db = json.load(db)
                for channels in _db.values():
                    for link in channels.keys():
                        channels[link] = Channel(*channels[link])
                return _db
        except FileNotFoundError:
            return {}
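
# A hedged usage sketch (not part of the original example). It assumes the
# Signal used above supports connect() and being called directly, that Channel
# is a (link, last_id, feed) tuple-like type from the surrounding project, and
# that 'feeds.json' and the URLs below are hypothetical.
def print_event(event):
    # Receives the Database.Event namedtuples published by add/remove calls.
    print(event.type, event.feed, event.link)

db = Database('feeds.json')
db.subscribe(print_event)
db.add_feed('https://example.org/rss')
db.add_channel('example-channel', 0, 'https://example.org/rss')   # prints: add ...
db.remove_channel('example-channel', 'https://example.org/rss')   # prints: remove ...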