Exemple #1
0
    def _sample_aligned_interval(
            self,
            interval,
            align_left=False,
            b_alpha=1.0,
            b_beta=1.0,
            name='interval_sample_',
            force_interval=False,
            **kwargs
    ):
        """
        Samples continuous subset of data with start row picked from `interval`.

        Start row is either walked forward one row at a time from the left interval bound
        (`align_left=True`), or drawn as `interval[0] + int(width * beta_draw)`, where
        `width = interval[-1] - interval[0] - self.sample_num_records` and `beta_draw` comes from
        `random_beta(a=b_alpha, b=b_beta)`. Default params `1.0, 1.0` are documented upstream as
        giving uniform distribution.

        Candidate is accepted when:
            1. weekday of its first record is in `self.start_weekdays`;
            2. time span of sampled rows exceeds `self.max_sample_len_delta` by less than `self.max_time_gap`.

        If `self.start_00` is set, start row is moved to the record nearest to 00:00 of the chosen day
        before rows are taken.

        Args:
            interval:       tuple, list or 1d-array of integers of length 2: [lower_row_number, upper_row_number];
            align_left:     if True - try to align sample to beginning of interval;
            b_alpha:        float > 0, sampling B-distribution alpha param, def=1;
            b_beta:         float > 0, sampling B-distribution beta param, def=1;
            name:           str, sample filename id
            force_interval: bool,  if true: delegate to `self._sample_exact_interval(interval, name)`
            **kwargs:       accepted and ignored.

        Returns:
            New instance created as `self.nested_class_ref(**self.nested_params)` with:
                - `.data` set to sampled rows;
                - `.filename` set to `name + 'num_<self.sample_num>_at_<start>'`;
                - `.metadata` keys `type`, `first_row`, `last_row` set;
            or result of `self._sample_exact_interval()` if `force_interval` is set.

        Raises:
            AssertionError: if instance holds no data, `interval` is not of length 2,
                            or B-distribution params are not positive;
            RuntimeError:   if no acceptable sample was found within allowed number of attempts.
        """
        # Explicit checks instead of `assert` statements: asserts are stripped under `python -O`,
        # which would silently disable all validation below. Raised exception types are kept as they were.
        try:
            holds_data = not self.data.empty

        except AttributeError:
            # `self.data` is missing or has no `.empty` attribute:
            holds_data = False

        if not holds_data:
            msg = 'Instance holds no data. Hint: forgot to call .read_csv()?'
            self.log.error(msg)
            raise AssertionError(msg)

        if len(interval) != 2:
            msg = 'Invalid interval arg: expected list or tuple of size 2, got: {}'.format(interval)
            self.log.error(msg)
            raise AssertionError(msg)

        if force_interval:
            return self._sample_exact_interval(interval, name)

        if not (b_alpha > 0 and b_beta > 0):
            msg = 'Expected positive B-distribution [alpha, beta] params, got: {}'.format([b_alpha, b_beta])
            self.log.error(msg)
            raise AssertionError(msg)

        sample_num_records = self.sample_num_records

        self.log.debug('Maximum sample time duration set to: {}.'.format(self.max_sample_len_delta))
        self.log.debug('Respective number of steps: {}.'.format(sample_num_records))
        self.log.debug('Maximum allowed data time gap set to: {}.\n'.format(self.max_time_gap))

        # Sanity check param: aligned walk may visit every row of interval, random draws are capped at 100.
        max_attempts = interval[-1] - interval[0] if align_left else 100

        quit_msg = 'Quitting after {} sampling attempts. Hint: check sampling params / dataset consistency.'

        def pick_first_row(shift):
            # Candidate start row: deterministic walk when aligned, random draw otherwise.
            if align_left:
                return interval[0] + shift

            # NOTE(review): if interval is narrower than `sample_num_records` the scaled offset is negative
            # and candidate falls left of `interval[0]` - confirm callers never pass such intervals.
            return interval[0] + int(
                (interval[-1] - interval[0] - sample_num_records) * random_beta(a=b_alpha, b=b_beta)
            )

        attempts = 0
        align_shift = 0

        # Sample enter point as close to beginning until all conditions are met:
        while attempts <= max_attempts:
            first_row = pick_first_row(align_shift)
            self.log.debug('_sample_interval_first_row: {}'.format(first_row))

            sample_first_day = self.data[first_row:first_row + 1].index[0]
            self.log.debug('Sample start: {}, weekday: {}.'.format(sample_first_day, sample_first_day.weekday()))

            # Keep sampling until good day:
            while sample_first_day.weekday() not in self.start_weekdays and attempts <= max_attempts:
                align_shift += 1

                self.log.debug('Not a good day to start, resampling...')

                first_row = pick_first_row(align_shift)
                self.log.debug('_sample_interval_first_row: {}'.format(first_row))

                sample_first_day = self.data[first_row:first_row + 1].index[0]
                self.log.debug('Sample start: {}, weekday: {}.'.format(sample_first_day, sample_first_day.weekday()))

                attempts += 1

            # Check if managed to get proper weekday:
            if attempts > max_attempts:
                msg = quit_msg.format(attempts)
                self.log.error(msg)
                raise RuntimeError(msg)

            # If 00 option set, get index of first record of that day:
            if self.start_00:
                adj_timedate = sample_first_day.date()
                self.log.debug('Start time adjusted to <00:00>')
                # `Index.get_loc(..., method='nearest')` was removed in pandas 2.0;
                # `get_indexer` gives the same nearest-record lookup on both old and new versions.
                # `normalize()` yields midnight of the same day as a timestamp the index can match.
                first_row = int(
                    self.data.index.get_indexer([sample_first_day.normalize()], method='nearest')[0]
                )

            else:
                adj_timedate = sample_first_day

            # Easy part:
            last_row = first_row + sample_num_records
            sampled_data = self.data[first_row: last_row]
            sample_len = (sampled_data.index[-1] - sampled_data.index[0]).to_pytimedelta()
            self.log.debug('Actual sample duration: {}.'.format(sample_len))
            self.log.debug('Total sample time gap: {}.'.format(sample_len - self.max_sample_len_delta))

            # Perform data gap check:
            if sample_len - self.max_sample_len_delta < self.max_time_gap:
                self.log.debug('Sample accepted.')
                # If sample OK - return new dataset:
                new_instance = self.nested_class_ref(**self.nested_params)
                new_instance.filename = name + 'num_{}_at_{}'.format(self.sample_num, adj_timedate)
                self.log.info('New sample id: <{}>.'.format(new_instance.filename))
                new_instance.data = sampled_data
                new_instance.metadata['type'] = 'interval_sample'
                new_instance.metadata['first_row'] = first_row
                new_instance.metadata['last_row'] = last_row

                return new_instance

            self.log.debug('Attempt {}: duration too big, resampling, ...\n'.format(attempts))
            attempts += 1
            align_shift += 1

        # Got here -> sanity check failed:
        msg = quit_msg.format(attempts)
        self.log.error(msg)
        raise RuntimeError(msg)
Exemple #2
0
    def _sample_interval(self, interval, b_alpha=1, b_beta=1):
        """
        Samples continuous subset of data with start row drawn from `interval`.

        Start row is drawn as `interval[0] + round(width * beta_draw)`, where
        `width = interval[-1] - interval[0] - self.episode_num_records - 1` and `beta_draw` comes from
        `random_beta(a=b_alpha, b=b_beta)`. Default params `1, 1` are documented upstream as
        giving uniform distribution.

        Candidate is accepted when:
            1. weekday of its first record is in `self.start_weekdays`;
            2. time span of sampled rows exceeds `self.max_episode_len` by less than `self.max_time_gap`.

        If `self.start_00` is set, start row is moved to the record nearest to 00:00 of the chosen day
        before rows are taken. At most 100 re-sampling attempts are made.

        Args:
            interval:       tuple, list or 1d-array of integers of length 2: [lower_position, upper_position];
            b_alpha:        sampling B-distribution alpha param;
            b_beta:         sampling B-distribution beta param;

        Returns:
            New instance created as `self.__class__(**self.params)` with:
                - `.data` set to sampled rows;
                - `.filename` set to `'_btgym_interval_sample_' + <start>`;
                - `.metadata` keys `type`, `first_row` set.

        Raises:
            AssertionError: if instance holds no data, `interval` is malformed or does not leave room for
                            `self.episode_num_records` rows, or no acceptable sample was found within
                            allowed number of attempts.
        """
        # Explicit checks instead of `assert` statements: asserts are stripped under `python -O`,
        # which would silently disable all validation below. AssertionError is kept as raised type.
        try:
            holds_data = not self.data.empty

        except AttributeError:
            # `self.data` is missing or has no `.empty` attribute:
            holds_data = False

        if not holds_data:
            raise AssertionError('BTgymDataset instance holds no data. Hint: forgot to call .read_csv()?')

        if len(interval) != 2:
            raise AssertionError(
                'Invalid interval arg: expected list or tuple of size 2, got: {}'.format(interval)
            )

        sample_num_records = self.episode_num_records

        if not interval[0] < interval[-1] < int(self.data.shape[0] - sample_num_records):
            raise AssertionError(
                'Cannot sample with size {}, in {} from dataset of {} records'.format(
                    sample_num_records, interval, self.data.shape[0]
                )
            )

        self.log.debug('Maximum sample time duration set to: {}.'.format(self.max_episode_len))
        self.log.debug('Respective number of steps: {}.'.format(sample_num_records))
        self.log.debug('Maximum allowed data time gap set to: {}.\n'.format(self.max_time_gap))

        quit_msg = 'Quitting after {} sampling attempts. Hint: check sampling params / dataset consistency.'

        def draw_first_row():
            # `int()` guards against `round()` returning a non-builtin number type for numpy floats,
            # which could not be used as a slice bound.
            return interval[0] + int(round(
                (interval[-1] - interval[0] - sample_num_records - 1) * random_beta(a=b_alpha, b=b_beta)
            ))

        # Sanity check param:
        max_attempts = 100
        attempts = 0

        # Keep sampling random enter points until all conditions are met:
        while attempts <= max_attempts:
            first_row = draw_first_row()

            episode_first_day = self.data[first_row:first_row + 1].index[0]
            self.log.debug('Sample start: {}, weekday: {}.'.format(episode_first_day, episode_first_day.weekday()))

            # Keep sampling until good day:
            while episode_first_day.weekday() not in self.start_weekdays and attempts <= max_attempts:
                self.log.debug('Not a good day to start, resampling...')
                first_row = draw_first_row()
                episode_first_day = self.data[first_row:first_row + 1].index[0]
                self.log.debug('Sample start: {}, weekday: {}.'.format(episode_first_day, episode_first_day.weekday()))
                attempts += 1

            # Check if managed to get proper weekday:
            if attempts > max_attempts:
                raise AssertionError(quit_msg.format(attempts))

            # If 00 option set, get index of first record of that day:
            if self.start_00:
                adj_timedate = episode_first_day.date()
                # Midnight of the same day as a timestamp the index can match:
                lookup_key = episode_first_day.normalize()
                self.log.debug('Start time adjusted to <00:00>')

            else:
                adj_timedate = episode_first_day
                lookup_key = episode_first_day

            # `Index.get_loc(..., method='nearest')` was removed in pandas 2.0;
            # `get_indexer` gives the same nearest-record lookup on both old and new versions.
            first_row = int(self.data.index.get_indexer([lookup_key], method='nearest')[0])

            # Easy part:
            last_row = first_row + sample_num_records
            episode_sample = self.data[first_row: last_row]
            episode_sample_len = (episode_sample.index[-1] - episode_sample.index[0]).to_pytimedelta()
            self.log.debug('Sample duration: {}.'.format(episode_sample_len, ))
            self.log.debug('Total sample time gap: {}.'.format(episode_sample_len - self.max_episode_len))

            # Perform data gap check:
            if episode_sample_len - self.max_episode_len < self.max_time_gap:
                self.log.debug('Sample accepted.')
                # If sample OK - return episodic-dataset:
                episode = self.__class__(**self.params)
                episode.filename = '_btgym_interval_sample_' + str(adj_timedate)
                self.log.info('Sample id: <{}>.'.format(episode.filename))
                episode.data = episode_sample
                episode.metadata['type'] = 'interval_sample'
                episode.metadata['first_row'] = first_row
                return episode

            self.log.debug('Attempt {}: duration too big, resampling, ...\n'.format(attempts))
            attempts += 1

        # Got here -> sanity check failed:
        msg = quit_msg.format(attempts)
        self.log.warning(msg)
        raise AssertionError(msg)