Example #1
def main(desired_iterations, save_path):
    # Define a log directory to use with TensorBoard
    # (not that we currently make use of TensorBoard at all)
    LOG_DIR = tempfile.mkdtemp()
    print "Tensorboard Log: " + LOG_DIR + '\n'

    # The directory to save the animations to
    SAVE_DIR = save_path

    # Define the simulation
    sim = Planning(get_noodle_environment())

    # Tensorflow!
    tf.reset_default_graph()
    session = tf.InteractiveSession()
    journalist = tf.train.SummaryWriter(LOG_DIR)
    brain = MLP([
        sim.observation_size,
    ], [200, 200, sim.num_actions], [tf.tanh, tf.tanh, tf.identity])
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001, decay=0.9)

    # DiscreteDeepQ object
    current_controller = DiscreteDeepQ(sim.observation_size,
                                       sim.num_actions,
                                       brain,
                                       optimizer,
                                       session,
                                       random_action_probability=0.2,
                                       discount_rate=0.9,
                                       exploration_period=1000,
                                       max_experience=10000,
                                       store_every_nth=1,
                                       train_every_nth=1,
                                       summary_writer=journalist)

    # Initialize the session
    session.run(tf.initialize_all_variables())
    session.run(current_controller.target_network_update)
    # journalist.add_graph(session.graph)

    # Run the simulation and let the robot learn
    num_simulations = 0

    iterations_needed = []
    total_rewards = []

    try:
        for game_idx in range(desired_iterations + 1):
            current_random_prob = current_controller.random_action_probability
            update_random_prob = game_idx != 0 and game_idx % 200 == 0
            if update_random_prob and 0.01 < current_random_prob <= 0.1:
                current_controller.random_action_probability = current_random_prob - 0.01
            elif update_random_prob and 0.1 < current_random_prob:
                current_controller.random_action_probability = current_random_prob - 0.1
            game = Planning(get_noodle_environment())
            game_iterations = 0

            observation = game.observe()
            while not game.is_over():
                action = current_controller.action(observation)
                reward = game.collect_reward(action)
                new_observation = game.observe()
                current_controller.store(observation, action, reward,
                                         new_observation)
                current_controller.training_step()
                observation = new_observation
                game_iterations += 1
            total_rewards.append(sum(game.collected_rewards))
            iterations_needed.append(game_iterations)
            rewards = []
            if game_idx % 50 == 0:
                print "\rGame %d:\nIterations before end: %d." % (
                    game_idx, game_iterations)
                if game.collected_rewards[-1] == 10:
                    print "Hit target!"
                print "Total Rewards: %s\n" % (sum(game.collected_rewards))
                if SAVE_DIR is not None:
                    game.save_path(SAVE_DIR, game_idx)

    except KeyboardInterrupt:
        print "Interrupted"

    # Plot the iterations and reward
    plt.figure(figsize=(12, 8))
    plt.plot(total_rewards, label='Reward')
    # plt.plot(iterations_needed, label='Iterations')
    plt.legend()
    plt.show()
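
# Not part of the original example: a minimal, hypothetical sketch of how
# main() might be invoked if this script were run directly. The argument
# names and default values below are made-up placeholders.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train a DiscreteDeepQ agent on the noodle environment')
    parser.add_argument('--iterations', type=int, default=1000)
    parser.add_argument('--save-path', default=None)
    args = parser.parse_args()

    main(args.iterations, args.save_path)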
Example #2
journalist.add_graph(session.graph_def)


# In[10]:

performances = []

try:
    for game_idx in range(2000):
        game = DiscreteHill()
        game_iterations = 0

        observation = game.observe()

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(observation)
            reward = game.collect_reward(action)
            game.perform_action(action)
            new_observation = game.observe()
            current_controller.store(observation, action, reward, new_observation)
            current_controller.training_step()
            observation = new_observation
            game_iterations += 1
        performance = float(game_iterations - (game.shortest_path)) / game.shortest_path
        performances.append(performance)
        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
except KeyboardInterrupt:
    print "Interrupted"
Example #3
performances = []

dt = 0.02
try:
    for game_idx in range(10000):
        game = Quadrotor()
        game_iterations = 0

        observation = game.observe()
        x0 = copy.deepcopy(observation)
        rewards = []
        cost0 = game.cost()
        path = [copy.deepcopy(observation)]
        while game_iterations < 100 and not game.is_over():
            action = current_controller.action(observation)
            game.perform_action(action)
            game.step(dt)
            cost1 = game.cost()

            reward = cost0 - cost1 - 2
            # reward = -reward
            rewards.append(reward)
            new_observation = game.observe()
            current_controller.store(observation, action, reward,
                                     new_observation)
            current_controller.training_step()

            observation = new_observation
            cost0 = cost1
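
# Not part of the original example: a quick worked illustration of the reward
# shaping used above, reward = cost0 - cost1 - 2, i.e. the decrease in cost
# minus a constant per-step penalty of 2:
#
#   cost drops from 10.0 to 7.0  ->  reward = 10.0 - 7.0 - 2 =  1.0
#   cost stays at 10.0           ->  reward = 10.0 - 10.0 - 2 = -2.0
#   cost rises from 7.0 to 9.0   ->  reward =  7.0 - 9.0 - 2 = -4.0
#
# So the agent only receives a positive reward when it reduces the cost by
# more than 2 in a single step.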
Example #4
# In[330]:

performances = []

try:
    for game_idx in range(2000):
        game = DiscreteHill()
        game_iterations = 0

        observation = game.observe()
        
        prev_frames = [(observation, -1)] * (n_prev_frames - 1)
        memory = np.concatenate([np.concatenate([observation, np.array([-1])])] * (n_prev_frames - 1) + [observation])
        
        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(memory)
            if n_prev_frames > 1:
                prev_frames = prev_frames[1:] + [(observation, action)]
            reward = game.collect_reward(action)
            game.perform_action(action)
            observation = game.observe()
            new_memory = np.concatenate([np.concatenate([a, np.array([b])]) for (a, b) in prev_frames] + [observation])
            current_controller.store(memory, action, reward, new_memory)
            current_controller.training_step()
            memory = new_memory
            game_iterations += 1
            cost = abs(game.target[0]) + abs(game.target[1])
        performances.append((game_iterations - cost) / float(cost))
        if game_idx % 100 == 0:
            print "\rGame %d: iterations before success %d." % (game_idx, game_iterations),
            print "Pos: %s, Target: %s" % (game.position, game.target),
Example #5
class MLDaemon:
    """MLDaemon of ASCAR
 
    All public members are thread-safe.
 
    :type controller: DiscreteDeepQ
    :type opt: dict
    :type conf: dict
    :type session: tf.Session
    """

    controller = None  # Tensorflow controller for MLP
    session = None  # Tensorflow session

    debugging_level = 0  # debug level for logging
    disable_training = False  # training is enabled by default
    enable_tuning = False  # tuning is disabled by default
    stop_requested = False  # request to stop training and action selection
    stopped = True  # whether the daemon has stopped

    opt = None  # option configuration for deep learning
    conf = None  # all configurations

    delay_between_actions = 1  # seconds between actions
    exploration_period = 5000  # length of the exploration period
    start_random_rate = 0.5  # initial random action rate
    checkpoint_time = 1800  # seconds between checkpoints

    last_observation = None  # last observation used in the training process
    last_action = None  # last set of parameter configurations
    last_step = None  # last set of parameter step sizes
    new_action = None  # new action that will be broadcast to the storage system
    save_path = None  # save path for the model and log file

    cumulative_reward = 0  # cumulative reward collected by the Deep Q learner

    test_number_of_steps_after_restore = 0  # number of actions already executed when the TensorFlow model was restored
    memcache_last_rowid = 0  # latest rowid of memcache

    def __init__(self, conf: dict = None, opt: dict = None):
        tf.disable_v2_behavior()  # run TensorFlow in v1 compatibility mode

        # get debugging level from config file
        if 'mldaemon_debugging_level' in conf['log']:
            self.debugging_level = conf['log']['mldaemon_debugging_level']

        # assign configuration and option
        self.opt = opt
        self.conf = conf

        # get directory for saving model and log_dir
        self.save_path = os.path.dirname(conf['replaydb']['dbfile'])

        self.disable_training = self.opt[
            'disable_training']  # get disable_training option from config file

        self.minibatch_size = self.opt[
            'minibatch_size']  # mini batch size for training (number of samples per training step)
        self.ticks_per_observation = self.opt[
            'ticks_per_observation']  # number of ticks (depends on tick_len) per observation
        self.observation_size = len(list(self.conf['node']['client_id'].values())) * \
            len(self.conf['ceph-param']) * self.ticks_per_observation

        # set up tuning system configuration (descriptions are given with the class attributes above)
        if 'delay_between_actions' in opt:
            self.delay_between_actions = opt['delay_between_actions']
        if 'exploration_period' in opt:
            self.exploration_period = opt['exploration_period']
        if 'start_random_rate' in opt:
            self.start_random_rate = opt['start_random_rate']
        if 'checkpoint_time' in opt:
            self.checkpoint_time = opt['checkpoint_time']

        self.enable_tuning = self.opt['enable_tuning']

        # Initialize database and retrieve data from database
        self.db = ReplayDB(self.opt, self.conf)
        self.db.refresh_memcache()

        # Store default action
        default = []
        default_step = []
        for param in self.conf['ceph-param']:
            val = list(param.values())[0]
            default.append(val['default'])
            default_step.append(val['step'])
        self.last_action = default
        self.last_step = default_step

        # make a temp directory for storing the TensorFlow log
        self.LOG_DIR = tempfile.mkdtemp()
        logger.info(
            f"LOG_DIR is locate at {self.LOG_DIR}. To enable Tensorboard run 'tensorboard --logdir [LOG_DIR]'"
        )

    def start(self):
        """Start MLDaemon
        
        This function create tensorflow controller and running the tuning by iteratively 
        training and choose action.
        """
        if self.debugging_level >= 1:
            import cProfile
            import io
            import pstats
            pr = cProfile.Profile()
            pr.enable()

        logger.info(f"Connected to database {self.conf['replaydb']['dbfile']}")

        # set stopped to False, so daemon can run
        self.stopped = False

        logger.info('Starting MLDaemon...')
        try:
            # TensorFlow business - it is always good to reset a graph before creating a new controller.
            ops.reset_default_graph()
            # ? shall we use InteractiveSession()?
            self.session = tf.Session()  # tf.InteractiveSession()

            # This little guy will let us run tensorboard
            #      tensorboard --logdir [LOG_DIR]
            journalist = tf.summary.FileWriter(self.LOG_DIR)

            # Brain maps from observation to Q values for different actions.
            # Here it is done using a multi-layer perceptron with 2 hidden
            # layers.
            hidden_layer_size = max(int(self.observation_size * 1.2), 200)
            logger.info('Observation size {0}, hidden layer size {1}'.format(
                self.observation_size, hidden_layer_size))
            brain = MLP([
                self.observation_size,
            ], [hidden_layer_size, hidden_layer_size, self.opt['num_actions']],
                        [tf.tanh, tf.tanh, tf.identity])

            # The optimizer to use. Here we use RMSProp as recommended
            # by the publication
            optimizer = tf.train.RMSPropOptimizer(learning_rate=0.001,
                                                  decay=0.9)
            # DiscreteDeepQ object
            self.controller = DiscreteDeepQ(
                (self.observation_size, ),
                self.opt['num_actions'],
                brain,
                optimizer,
                self.session,
                discount_rate=0.99,
                start_random_rate=self.start_random_rate,
                exploration_period=self.exploration_period,
                random_action_probability=self.
                opt['random_action_probability'],
                train_every_nth=1,
                summary_writer=journalist,
                k_action=int(self.opt['k_val']))

            self.session.run(tf.initialize_all_variables())
            self.session.run(self.controller.target_network_update)

            # Check if there is a saved model to load before updating the graph
            if os.path.isfile(os.path.join(self.save_path, 'model')):
                self.controller.restore(self.save_path)
                logger.info('Loaded saved model from ' + self.save_path)
            else:
                logger.info('No saved model found')
            self.test_number_of_steps_after_restore = self.controller.actions_executed_so_far

            # graph was not available when journalist was created
            journalist.add_graph(self.session.graph)

            last_action_second = 0  # timestamp of the last action
            last_training_step_duration = 0  # duration of the last training step
            last_checkpoint_time = time.time()  # time of the last checkpoint
            while not self.stop_requested:
                begin_time = time.time()  # set begin time to current time

                # Run training step
                logger.info('Start training step...')
                minibatch_size, prediction_error = self._do_training_step()

                if minibatch_size > 0:
                    # Save a checkpoint every self.checkpoint_time seconds
                    logger.info(
                        f'Time before checkpoint: {self.checkpoint_time - (time.time() - last_checkpoint_time)}'
                    )
                    if time.time(
                    ) - last_checkpoint_time > self.checkpoint_time:
                        # save controller checkpoint
                        cp_path = os.path.join(
                            self.save_path,
                            'checkpoint_' + time.strftime('%Y-%m-%d_%H-%M-%S'))
                        os.mkdir(cp_path)
                        self.controller.save(cp_path)
                        # update checkpoint time
                        last_checkpoint_time = time.time()
                        logger.info('Checkpoint saved in ' + cp_path)

                    # update last training duration
                    last_training_step_duration = time.time() - begin_time
                    logger.info(
                        'Finished {step}th training step in {time} seconds '
                        'using {mb} samples with prediction error {error}.'.
                        format(step=self.controller.iteration,
                               time=last_training_step_duration,
                               mb=minibatch_size,
                               error=prediction_error))
                else:
                    logger.info('Not enough data for training yet.')

                # Check if it is time for tuning
                # (has the delay between actions elapsed, after accounting for the time spent training?)
                if time.time() - (
                        last_action_second + 0.5
                ) >= self.delay_between_actions - last_training_step_duration:
                    if self.enable_tuning:
                        logger.debug('Start tuning step...')

                        try:
                            # Update memcache for the next training interval
                            self.db.refresh_memcache()
                        except:
                            pass

                        # sleep time is either 0 or whatever is left until the next action is due
                        sleep_time = max(
                            0, self.delay_between_actions -
                            (time.time() - (last_action_second + 0.5)))
                        if sleep_time > 0.05:
                            # Do garbage collection before a long sleep
                            gc.collect()
                            sleep_time = max(
                                0, self.delay_between_actions -
                                (time.time() - (last_action_second + 0.5)))
                        if sleep_time > 0.0001:
                            logger.debug(f'Sleeping {sleep_time} seconds')
                            # Welp, basically sleep
                            time.sleep(sleep_time)

                        # Do action step
                        ts = int(time.time())
                        self._do_action_step(ts)
                        # Update action to current time
                        last_action_second = ts
                    else:
                        logger.debug('Tuning disabled.')
                        # Check for new data every 200 steps to reduce checking overhead
                        if self.controller.number_of_times_train_called % 200 == 0:
                            try:
                                self.db.refresh_memcache()
                                pass
                            except:
                                pass

                    # We always print out the reward to the log for analysis
                    logger.info(f'Cumulative reward: {self.cumulative_reward}')

                    # Flush the log at the end for the next run
                    flush_log()
        finally:
            # set stopped to True, so daemon can properly stop
            self.stopped = True
            # controller.save would not work here because the controller may still be None
            # self.controller.save(self.save_path)
            logger.info('MLDaemon stopped.')

            if self.debugging_level >= 1:
                pr.disable()
                s = io.StringIO()
                sortby = 'cumulative'
                ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
                ps.print_stats()
                print(s.getvalue())

    @staticmethod
    def store(*unused):
        pass

    def _do_training_step(self) -> (int, float):
        """Do a training step
        
        This function is NOT thread-safe and can only be called within the worker thread.
        
        Raises:
            RuntimeError: training is disabled
            
        Returns:
            (int, float): size of the mini batch, prediction error
        """
        if not self.disable_training:
            # Get training batch from memcache in replay database
            mini_batch = self.get_minibatch()
            if mini_batch:
                logger.info(f'Retrieved batch size: {len(mini_batch)}')
                return len(mini_batch), self.controller.training_step(
                    mini_batch)
            else:
                return 0, None
        else:
            raise RuntimeError('Training is disabled')

    def _do_action_step(self, ts):
        """ Do an action step
 
        This function is NOT thread-safe and can only be called within the worker thread.
 
        Raises:
            RuntimeError: tuning is disabled, so no action will be performed
        """
        if not self.enable_tuning:
            raise RuntimeError('Tuning is disabled')

        try:
            # get new observation
            new_observation = self.observe()
            # collect reward
            reward = self.collect_reward()
        except BaseException as e:
            logger.info('{0}. Skipped taking action.'.format(str(e)))
            traceback.print_exc()
            return

        # Store last transition. This is only needed for the discrete hill test case.
        if self.last_observation is not None:
            # TODO: Implement store function (CAPES hasn't implemented this function yet)
            self.store(self.last_observation, self.last_action, reward,
                       new_observation)
            pass

        # get action from new observation
        self.new_action = self.controller.action(new_observation)
        # Perform action
        self.perform_action(self.new_action, ts)

        # Update last observation to current one
        self.last_observation = new_observation

    def get_minibatch(self):
        """Get mini batch for training
        
        This function is NOT thread-safe and can only be called within the worker thread.
        It calls ReplayDB to retrieve mini batch
 
        Returns:
            list: mini batch of (observation, action, reward, next observation,
                timestamp) tuples
        """
        # We need at least ticks_per_observation+1 ticks for one sample
        if len(self.db.memcache) < self.ticks_per_observation + 1:
            return None

        result = []  # mini batch that will be returned in the end
        required_samples = self.minibatch_size  # samples per batch
        while True:
            # number of usable samples: memcache size minus ticks per observation
            total_sample_size = len(
                self.db.memcache) - self.ticks_per_observation
            # If no more samples can be gathered, return what we have
            if total_sample_size <= len(result):
                return result
            # cap the required sample count at the number of samples available in memcache
            required_samples = min(total_sample_size, required_samples)
            # The last idx has to be excluded (no action has been decided for it yet).
            # Pick sample indices at random, from tick ticks_per_observation - 1 up to
            # (but excluding) the last tick, for the remaining number of required samples.
            sample_idx = random.sample(
                range(self.ticks_per_observation - 1,
                      len(self.db.memcache) - 1),
                required_samples - len(result))
            for i in sample_idx:
                try:
                    # get observation and next observation from sample index
                    observ = self.get_observation_by_cache_idx(i)
                    observ_next = self.get_next_observation_by_cache_idx(i)

                    # calculate reward from observation of current step and next step
                    normalized_total_lat = self.inverse_norm_latency(self._calc_total_latency(observ_next)) - \
                        self.inverse_norm_latency(self._calc_total_latency(observ))

                    reward = (self._calc_total_throughput(observ_next) -
                              self._calc_total_throughput(observ)
                              ) + normalized_total_lat

                    # The final ts is only used in test cases
                    # append data into result as tuple
                    ts = self.db.memcache[i][0]
                    action = self.db.memcache[i][1]
                    # Skip observations whose action was never recorded
                    if (action != [-1]) and (
                            action != [None] * len(self.conf['ceph-param'])):
                        result.append(
                            (observ, action, reward, observ_next, ts))

                    if len(result) == required_samples:
                        logger.debug(
                            f'Retrieved mini batch with data: {result}')
                        return result
                except NotEnoughDataError:
                    logger.info(f'NotEnoughDataError for memcache idx {i}')

    def inverse_norm_latency(self, x):
        """ Normalize inverse of latency value
 
            Normalize of inverse latency value to have the same range of 
            value as bandwith
 
            Returns:
                float: a normalize inverse of latency value
 
        """
        if (self.opt['min_latency'] > 0):
            inv_min_lat = 1 / self.opt['min_latency']
        else:
            inv_min_lat = 0

        if (self.opt['max_latency'] > 0):
            inv_max_lat = 1 / self.opt['max_latency']
        else:
            inv_max_lat = 0

        if (x > 0):
            inv_lat = 1 / x
        else:
            inv_lat = 0

        return (inv_lat - inv_min_lat) * (self.opt['max_bandwidth'] -
                                          self.opt['min_bandwidth']) / (
                                              inv_max_lat - inv_min_lat)

    def observe(self) -> np.ndarray:
        """ Return lastest observation vector using memcache.
        
        Get observation vector using memcache
 
        Raises:
            NotEnoughDataError: ticks in memcache is not enough
            NotEnoughDataError: cannot get observation from 
                self.get_observation_by_cache_idx(idx)
 
        Returns:
            np.ndarray: Observation at index idx
        """

        err_msg = 'No valid observation in the past two seconds'

        # If ticks in memcache is not enough
        if len(self.db.memcache) < self.ticks_per_observation:
            raise NotEnoughDataError(err_msg)
        # Loop backwards from the last tick (last memcache index), trying at most the last two ticks and not going below tick ticks_per_observation - 1
        for idx in range(
                len(self.db.memcache) - 1,
                max(len(self.db.memcache) - 3, self.ticks_per_observation - 1),
                -1):
            try:
                # Return the latest possible observation
                return self.get_observation_by_cache_idx(idx)
            except NotEnoughDataError:
                pass
        # No observation found, so raise with err_msg
        raise NotEnoughDataError(err_msg)

    def collect_reward(self):
        """ Reward is the sum of read throughput + write throughput of clients
 
        Return reward for the last observation
        
        Returns:
            float: reward from current total thoughput and previous total throughput
        """
        observ, prev_observ = self.db.get_last_n_observation(2)

        normalized_total_lat = self.inverse_norm_latency(self._calc_total_latency(observ)) - \
                        self.inverse_norm_latency(self._calc_total_latency(prev_observ))

        return (self._calc_total_throughput(observ) -
                self._calc_total_throughput(prev_observ)) + normalized_total_lat

    def get_observation_by_cache_idx(self, idx: int) -> np.ndarray:
        """ Get observation from index
 
        Args:
            idx (int): observation index
 
        Raises:
            NotEnoughDataError: idx does not have ticks_per_observation ticks of history ending at it
            NotEnoughDataError: the number of missing entries exceeds missing_entry_tolerance
 
        Returns:
            np.ndarray: observation data of the index
        """

        # Check if idx is out of range
        assert 0 <= idx < len(self.db.memcache)

        # Check that idx has enough preceding ticks for a full observation
        if idx < self.ticks_per_observation - 1:
            raise NotEnoughDataError
        # Index of the first tick in this observation window
        idx_start = idx - self.ticks_per_observation + 1

        # Create result ndarray for storing data from memcache, shaped as
        # (number of clients, ticks per observation, PIs per client)
        result = np.zeros(
            (len(self.db.client_list), self.ticks_per_observation,
             len(self.conf['ceph-param'])),
            dtype=float)
        missing_entry = 0

        # Loop each index until reach idx
        for i in range(idx_start, idx + 1):
            # Loop each client
            for client_id_idx in range(len(self.db.client_list)):
                # Check for missing entry
                if self.db.memcache[i][2][client_id_idx] is None:
                    missing_entry += 1  # Add missing entry
                    # Check for tolerance
                    if missing_entry > self.db.missing_entry_tolerance:
                        raise NotEnoughDataError('Too many missing entries')
                else:
                    # Add PIs data at i to result list
                    result[client_id_idx, i -
                           idx_start] = self.db.memcache[i][2][client_id_idx]
        # return result vector
        return result.reshape((self.observation_size, ))

    def get_next_observation_by_cache_idx(self, idx: int) -> np.ndarray:
        """Get next observation from idx
 
        Args:
            idx (int): observation index
 
        Raises:
            NotEnoughDataError: index is last tick in memcache
            NotEnoughDataError: next tick is not equal to ts at idx + tick_len
 
        Returns:
            np.ndarray: next observation
        """

        # Check if idx is out of range
        assert 0 <= idx < len(self.db.memcache)
        # if index is the last tick
        if idx == len(self.db.memcache) - 1:
            raise NotEnoughDataError
        # # Check if next tick is continuous
        # if self.db.memcache[idx][0] + self.opt['tick_len'] != self.db.memcache[idx + 1][0]:
        #     logger.info('or sth here')
        #     raise NotEnoughDataError
        # Check observation at next tick
        return self.get_observation_by_cache_idx(idx + 1)

    def _calc_total_throughput(self, observ: np.ndarray) -> float:
        """ Calculate total throughput of an observation
 
        Only the throughput of the last tick in the observation is included in the reward.
 
        Args:
            observ (np.ndarray): observation
 
        Returns:
            float: total throughput
        """
        # Get the number of clients
        if 'client_id' in self.conf['node']:
            client_num = len(self.conf['node']['client_id'])
        else:
            client_num = 1

        # reshape also checks the shape of observ
        observ = np.reshape(observ, (client_num, self.ticks_per_observation,
                                     len(self.conf['ceph-param'])))
        result = 0.0  # throughput result

        # Loop through each client
        for client_idx in range(client_num):
            # Loop through each server (Should start from 0)
            for osc in range(len(self.conf['node']['server_addr'])):
                # Get read_bytes, write_bytes indices
                read_ix = osc * (self.opt['pi_per_client_obd']) + 0
                write_ix = osc * (self.opt['pi_per_client_obd']) + 1
                # Get read_bytes, write_bytes from observation
                read_bytes = observ[client_idx, self.ticks_per_observation - 1,
                                    read_ix]
                write_bytes = observ[client_idx,
                                     self.ticks_per_observation - 1, write_ix]
                # sanity check: our machine can't be faster than 300 MB/s
                assert 0 <= read_bytes <= 300 * 1024 * 1024
                assert 0 <= write_bytes <= 300 * 1024 * 1024
                result += read_bytes + write_bytes
        return result

    def _calc_total_latency(self, observ: np.ndarray) -> float:
        """ Calculate latency of an observation
 
        Only the latency of the last tick in the observation is included in the reward.

        Args:
            observ (np.ndarray): observation

        Returns:
            float: total latency
        """
        # Get the number of clients
        if 'client_id' in self.conf['node']:
            client_num = len(self.conf['node']['client_id'])
        else:
            client_num = 1

        # reshape also checks the shape of observ
        observ = np.reshape(observ, (client_num, self.ticks_per_observation,
                                     len(self.conf['ceph-param'])))
        result = 0.0  # latency result

        # Loop through each client
        for client_idx in range(client_num):
            # Loop through each server
            for osc in range(len(self.conf['node']['server_addr'])):
                # Get latency read and write indices
                latency_r_ix = osc * (self.opt['pi_per_client_obd']) + 2
                latency_w_ix = osc * (self.opt['pi_per_client_obd']) + 3

                # Get latency read and write from observation
                latency_r = observ[client_idx, self.ticks_per_observation - 1,
                                   latency_r_ix]
                latency_w = observ[client_idx, self.ticks_per_observation - 1,
                                   latency_w_ix]
                result += latency_r + latency_w
        return result

    def perform_action(self, actions, ts):
        """Send the new action to IntfDaemon
 
        Args:
            actions: the actions predicted by discrete_deepq
            ts: timestamp at which the actions are applied
        """
        # logger.info(f'{action}')
        # assert action.shape == tuple([1,4])
        # assert 0 <= action_id < self.opt['num_actions']

        action_opt = {
            0: self._increase_p_s,
            1: self._increase_p_decrease_s,
            2: self._increase_p_dna,
            3: self._decrease_p_increase_s,
            4: self._decrease_p_s,
            5: self._decrease_p_dna
        }
        for action_id in actions:
            if action_id > 0:
                param_id = (action_id - 1) // 6
                logger.info(f"This is {param_id}")
                logger.info(f"This is {self.conf['ceph-param'][param_id]}")
                param_valu = list(
                    self.conf['ceph-param'][param_id].values())[0]
                param_type = param_valu['type']
                min_val = param_valu['min']
                max_val = param_valu['max']
                step_change = self.opt['stepsize_change']
                if (param_type == "str"):
                    # TODO: Handle string type parameter
                    continue
                else:
                    action_opt[action_id % 6](param_id, min_val, max_val,
                                              step_change)

        self.db.connect_db()
        for t in (ts - self.delay_between_actions, ts):
            try:
                logger.info(f'insert action at: {t}')
                self.db.insert_action(t, self.last_action)
            except sqlite3.IntegrityError as e:
                pass
        self.db.conn.close()
        # # Broadcast action must begin with action_id, which will be saved by
        # # IntfDaemon to the DB.
        ControllerInft.broadcastAction(self.last_action, ts, self.conf,
                                       self.opt)

    def _increase_p_s(self, param_id, min_val, max_val, step_change):
        next_step_size = self.last_step[param_id] + step_change
        if self.last_action[param_id] + next_step_size > max_val:
            # invalid move
            pass
        else:
            # Do increase step size and parameter value
            self.last_step[param_id] += step_change
            self.last_action[param_id] += self.last_step[param_id]

    def _increase_p_decrease_s(self, param_id, min_val, max_val, step_change):
        next_step_size = self.last_step[param_id] - step_change
        if self.last_action[param_id] + next_step_size > max_val:
            # invalid move
            pass
        else:
            # Decrease step size and increase parameter value
            self.last_step[param_id] -= step_change
            self.last_action[param_id] += self.last_step[param_id]

    def _increase_p_dna(self, param_id, min_val, max_val, step_change):
        if self.last_action[param_id] + self.last_step[param_id] > max_val:
            # invalid move
            pass
        else:
            # Do increase parameter value
            self.last_action[param_id] += self.last_step[param_id]

    def _decrease_p_increase_s(self, param_id, min_val, max_val, step_change):
        next_step_size = self.last_step[param_id] + step_change
        if self.last_action[param_id] - next_step_size < min_val:
            # invalid move
            pass
        else:
            # Increase step size and decrease parameter value
            self.last_step[param_id] += step_change
            self.last_action[param_id] -= self.last_step[param_id]

    def _decrease_p_s(self, param_id, min_val, max_val, step_change):
        next_step_size = self.last_step[param_id] - step_change
        if self.last_action[param_id] - next_step_size < min_val:
            # invalid move
            pass
        else:
            # Do decrease step size and parameter value
            self.last_step[param_id] -= step_change
            self.last_action[param_id] -= self.last_step[param_id]

    def _decrease_p_dna(self, param_id, min_val, max_val, step_change):
        if self.last_action[param_id] - self.last_step[param_id] < min_val:
            # invalid move
            pass
        else:
            # Do decrease parameter value
            self.last_action[param_id] -= self.last_step[param_id]
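
# Not part of the original example: a hypothetical sketch of the `opt` and
# `conf` dictionaries that MLDaemon reads. Only keys the class actually
# accesses are listed; every value shown here is a made-up placeholder.
example_opt = {
    'disable_training': False,
    'enable_tuning': True,
    'minibatch_size': 32,
    'ticks_per_observation': 4,
    'delay_between_actions': 1,
    'exploration_period': 5000,
    'start_random_rate': 0.5,
    'checkpoint_time': 1800,
    # assumed to be 1 no-op + 6 moves per tunable parameter, based on
    # perform_action() above
    'num_actions': 7,
    'random_action_probability': 0.05,
    'k_val': 1,
    'stepsize_change': 1,
    'pi_per_client_obd': 4,
    'min_latency': 0.001,
    'max_latency': 1.0,
    'min_bandwidth': 0,
    'max_bandwidth': 300 * 1024 * 1024,
}
example_conf = {
    'log': {'mldaemon_debugging_level': 0},
    'replaydb': {'dbfile': '/tmp/ascar/replay.db'},
    'node': {
        'client_id': {'client-0': 1, 'client-1': 2},
        'server_addr': ['10.0.0.1', '10.0.0.2'],
    },
    # one entry per tunable parameter: default value, step size, type and range
    'ceph-param': [
        {'osd_max_backfills': {'default': 1, 'step': 1, 'type': 'int',
                               'min': 1, 'max': 16}},
    ],
}
# daemon = MLDaemon(conf=example_conf, opt=example_opt)
# daemon.start()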
Example #6
performances = []

try:
    for game_idx in range(2000):
        game = DiscreteHill()
        game_iterations = 0

        observation = game.observe()

        prev_frames = [(observation, -1)] * (n_prev_frames - 1)
        memory = np.concatenate(
            [np.concatenate([observation, np.array([-1])])] *
            (n_prev_frames - 1) + [observation])

        while game_iterations < 50 and not game.is_over():
            action = current_controller.action(memory)
            if n_prev_frames > 1:
                prev_frames = prev_frames[1:] + [(observation, action)]
            reward = game.collect_reward(action)
            game.perform_action(action)
            observation = game.observe()
            new_memory = np.concatenate(
                [np.concatenate([a, np.array([b])])
                 for (a, b) in prev_frames] + [observation])
            current_controller.store(memory, action, reward, new_memory)
            current_controller.training_step()
            memory = new_memory
            game_iterations += 1
            cost = abs(game.target[0]) + abs(game.target[1])
        performances.append((game_iterations - cost) / float(cost))
        if game_idx % 100 == 0: