Example #1
def test__assemble_sequences_context_error():
    """A context column that varies within an entity must raise a ValueError."""
    frame = pd.DataFrame({
        'a': [1, 1, 1, 1, 2, 2, 2, 2],
        'b': [1, 1, 2, 2, 3, 3, 4, 4],
        'c': [9, 8, 7, 6, 5, 4, 3, 2],
    })

    # Column 'b' changes inside each 'a' group, so it is not a valid context.
    with pytest.raises(ValueError):
        assemble_sequences(frame, ['a'], ['b'], 2, None)
Example #2
    def _fit(self, timeseries_data):
        """Build the model and fit it, deriving column types from the metadata.

        The data/context column types are looked up in ``self._metadata``;
        the sequence index is always treated as a continuous column.
        """
        self._model = self._build_model()

        self._output_columns = list(timeseries_data.columns)
        non_data = self._entity_columns + self._context_columns
        self._data_columns = [
            column for column in timeseries_data.columns
            if column not in non_data
        ]

        sequences = assemble_sequences(
            timeseries_data,
            self._entity_columns,
            self._context_columns,
            self._segment_size,
            self._sequence_index,
            drop_sequence_index=False,
        )

        data_types = []
        context_types = []
        for field, meta in self._metadata.get_fields().items():
            data_type = self._DATA_TYPES.get(meta['type'])
            if not data_type:
                continue

            if field == self._sequence_index:
                data_types.append('continuous')
            elif field in self._data_columns:
                data_types.append(data_type)
            elif field in self._context_columns:
                context_types.append(data_type)

        if self._sequence_index:
            self._transform_sequence_index(sequences)
            data_types.append('continuous')

        # Validate and fit
        self._model.fit_sequences(sequences, context_types, data_types)
Example #3
def test__assemble_sequences_entity_and_time_segment_size():
    """With entity columns and a timedelta segment_size, group then segment by time."""
    frame = pd.DataFrame({
        'a': [1, 1, 1, 1],
        'b': [1, 2, 3, 4],
        'c': [9, 8, 7, 6],
        'time': pd.date_range(start='2001-01-01', periods=4, freq='1d'),
    })

    out = assemble_sequences(frame, ['a'], [], pd.to_timedelta('2d'), 'time')

    expected = [
        {'context': [], 'data': [[1, 2], [9, 8]]},
        {'context': [], 'data': [[3, 4], [7, 6]]},
    ]
    assert isinstance(out, list)
    assert out == expected
Example #4
    def _fit(self, timeseries_data):
        """Build the model and fit it, inferring column types from pandas dtypes.

        Integer/float dtypes map to ``continuous``; object/bool dtypes map to
        ``categorical``. Any other dtype raises a ``ValueError``.
        """
        self._model = self._build_model()

        if self._sequence_index:
            # Undo the '.value' suffix added upstream for the sequence index.
            renamed = {self._sequence_index + '.value': self._sequence_index}
            timeseries_data = timeseries_data.rename(columns=renamed)

        self._output_columns = list(timeseries_data.columns)
        non_data = self._entity_columns + self._context_columns
        self._data_columns = [
            column for column in timeseries_data.columns
            if column not in non_data
        ]

        sequences = assemble_sequences(
            timeseries_data,
            self._entity_columns,
            self._context_columns,
            self._segment_size,
            self._sequence_index,
            drop_sequence_index=False,
        )

        data_types = []
        context_types = []
        for field in self._output_columns:
            dtype = timeseries_data[field].dtype
            if dtype.kind in ('i', 'f'):
                data_type = 'continuous'
            elif dtype.kind in ('O', 'b'):
                data_type = 'categorical'
            else:
                raise ValueError(f'Unsupported dtype {dtype}')

            if field in self._data_columns:
                data_types.append(data_type)
            elif field in self._context_columns:
                context_types.append(data_type)

        if self._sequence_index:
            self._transform_sequence_index(sequences)
            data_types.append('continuous')

        # Validate and fit
        self._model.fit_sequences(sequences, context_types, data_types)
Example #5
    def _get_evaluation_data(self, segment_size):
        """Reassemble model sequences into a single DataFrame for evaluation.

        Each sequence is converted back into rows, its context values are
        broadcast across the rows, and the enumeration index is used as a
        synthetic entity id.

        Args:
            segment_size (int, pd.Timedelta or None):
                Segment size forwarded to ``assemble_sequences``.

        Returns:
            pd.DataFrame:
                Rows of all sequences, with the columns of ``self.data``
                first (NaN-filled when absent from the sequences).
        """
        sequences = assemble_sequences(self.data, self.entity_columns,
                                       self.context_columns, segment_size,
                                       self.sequence_index)

        # Collect per-sequence frames and concatenate once at the end:
        # ``DataFrame.append`` was removed in pandas 2.0 and the old
        # append-in-a-loop pattern re-copied the frame on every iteration.
        frames = []
        for idx, sequence in enumerate(sequences):
            sequence_df = pd.DataFrame(sequence['data'],
                                       index=self.model_columns).T
            for column, value in zip(self.context_columns,
                                     sequence['context']):
                sequence_df[column] = value

            for column in self.entity_columns:
                # Each assembled sequence becomes its own synthetic entity.
                sequence_df[column] = idx

            frames.append(sequence_df)

        if not frames:
            return pd.DataFrame(columns=self.data.columns)

        evaluation_data = pd.concat(frames)

        # Reproduce the old append column order: self.data's columns first
        # (NaN-filled if missing), then any extra sequence columns.
        columns = list(self.data.columns)
        columns += [col for col in evaluation_data.columns if col not in columns]
        return evaluation_data.reindex(columns=columns)
Example #6
def test__assemble_sequences_no_entity_no_context():
    """Without entity columns, the raw data is simply cut into segments."""
    frame = pd.DataFrame({
        'a': [1, 2, 3, 4, 5, 6],
        'b': [9, 8, 7, 6, 5, 4],
    })

    out = assemble_sequences(frame, [], [], 3, None)

    expected = [
        {'context': [], 'data': [[1, 2, 3], [9, 8, 7]]},
        {'context': [], 'data': [[4, 5, 6], [6, 5, 4]]},
    ]
    assert isinstance(out, list)
    assert out == expected
Example #7
def test__assemble_sequences_entity_and_segment_size():
    """With entity columns and an int segment_size, group by entity then segment."""
    frame = pd.DataFrame({
        'a': [1, 1, 1, 1, 1, 1],
        'b': [1, 2, 3, 4, 5, 6],
        'c': [9, 8, 7, 6, 5, 4],
    })

    out = assemble_sequences(frame, ['a'], [], 3, None)

    expected = [
        {'context': [], 'data': [[1, 2, 3], [9, 8, 7]]},
        {'context': [], 'data': [[4, 5, 6], [6, 5, 4]]},
    ]
    assert isinstance(out, list)
    assert out == expected
Example #8
    def fit(self,
            data,
            entity_columns=None,
            context_columns=None,
            data_types=None,
            segment_size=None,
            sequence_index=None):
        """Fit the model to a dataframe containing time series data.

        Args:
            data (pd.DataFrame):
                DataFrame containing all the timeseries data alongside the
                entity and context columns.
            entity_columns (list[str]):
                Names of the columns which identify different time series
                sequences. These will be used to group the data in separated
                training examples.
            context_columns (list[str]):
                The columns in the dataframe which are constant within each
                group/entity. These columns will be provided at sampling time
                (i.e. the samples will be conditioned on the context variables).
            data_types (dict[str, str]):
                Dictionary indicating the data types of each column, which can
                be ``categorical``, ``continuous`` or ``datetime``.
            segment_size (int, pd.Timedelta or str):
                If specified, cut each training sequence into several segments
                of the indicated size. The size can either be passed as an
                integer value, which will be interpreted as the number of data
                points to put on each segment, or as a pd.Timedelta (or
                equivalent str representation), which will be interpreted as
                the segment length in time. Timedelta segment sizes can only be
                used with sequence indexes of type datetime.
            sequence_index (str):
                Name of the column that acts as the order index of each sequence.
                The sequence index column can be of any type that can be sorted,
                such as integer values or datetimes.

        Raises:
            TypeError:
                If ``segment_size`` is not given when there are no
                ``entity_columns``, or if a non-integer ``segment_size`` is
                used without a datetime ``sequence_index``.
        """
        if not entity_columns and segment_size is None:
            raise TypeError(
                'If the data has no `entity_columns`, `segment_size` must be given.'
            )
        if segment_size is not None and not isinstance(segment_size, int):
            if sequence_index is None:
                raise TypeError('`segment_size` must be of type `int` if '
                                'no `sequence_index` is given.')
            if data[sequence_index].dtype.kind != 'M':
                raise TypeError('`segment_size` must be of type `int` if '
                                '`sequence_index` is not a `datetime` column.')

            # Normalize a str/Timedelta-like segment size to a pd.Timedelta.
            segment_size = pd.to_timedelta(segment_size)

        self._output_columns = list(data.columns)
        self._entity_columns = entity_columns or []
        self._context_columns = context_columns or []
        self._data_columns = [
            column for column in data.columns
            if column not in self._entity_columns + self._context_columns
        ]
        if sequence_index:
            # The sequence index is handled separately: it is neither modeled
            # as a data column nor emitted as a model output column.
            self._output_columns.remove(sequence_index)
            self._data_columns.remove(sequence_index)

        data_types = self._get_data_types(data, data_types, self._data_columns)
        context_types = self._get_data_types(data, data_types,
                                             self._context_columns)
        sequences = assemble_sequences(data, self._entity_columns,
                                       self._context_columns, segment_size,
                                       sequence_index)

        # Validate and fit
        self._validate(sequences, context_types, data_types)
        self.fit_sequences(sequences, context_types, data_types)

        # Store context values
        self._context_values = data[self._context_columns]