Example 1
    def __init__(self, data_df):
        """
        Args:
            data_df (pd.DataFrame): Pre-loaded data frame holding the raw data.
        """
        super(ElectricityFormatter, self).__init__()
        # Store the data with a clean, contiguous index
        self.data = data_df.reset_index(drop=True)

        # Resolve key columns from the column definition tuples
        self.id_col = get_single_col_by_input_type(InputTypes.ID,
                                                   self._column_definition)
        self.time_col = get_single_col_by_input_type(InputTypes.TIME,
                                                     self._column_definition)
        self.target_col = get_single_col_by_input_type(InputTypes.TARGET,
                                                       self._column_definition)
        self.input_cols = [
            tup[0] for tup in self._column_definition
            if tup[2] not in {InputTypes.ID, InputTypes.TIME}
        ]
        self.col_mappings = {
            'identifier': [self.id_col],
            'time': [self.time_col],
            'outputs': [self.target_col],
            'inputs': self.input_cols
        }
        self.lookback = self.get_time_steps()
        self.num_encoder_steps = self.get_num_encoder_steps()

        # Build the sliding-window index, then drop windows that would run
        # past the (average) end of an entity's trajectory
        self.data_index = self.get_index_filtering()
        self.group_size = self.data.groupby([self.id_col]).size().mean()
        self.data_index = self.data_index[
            self.data_index.end_rel < self.group_size].reset_index()
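
The snippets here rely on helpers and enums (InputTypes, DataTypes, get_single_col_by_input_type, extract_cols_from_data_type) that are not shown. A minimal sketch consistent with how they are used, with column definitions as (name, DataType, InputType) tuples, might look as follows; the exact enum members are assumptions rather than the library's verbatim code.

import enum


class DataTypes(enum.IntEnum):
    """Data type of a column (assumed members)."""
    REAL_VALUED = 0
    CATEGORICAL = 1
    DATE = 2


class InputTypes(enum.IntEnum):
    """Role a column plays in the model (assumed members)."""
    TARGET = 0
    OBSERVED_INPUT = 1
    KNOWN_INPUT = 2
    STATIC_INPUT = 3
    ID = 4    # single column identifying each entity
    TIME = 5  # single column holding the time index


def get_single_col_by_input_type(input_type, column_definition):
    """Returns the name of the single column with the given input type."""
    cols = [tup[0] for tup in column_definition if tup[2] == input_type]
    if len(cols) != 1:
        raise ValueError('Expected exactly one column for {}'.format(input_type))
    return cols[0]


def extract_cols_from_data_type(data_type, column_definition,
                                excluded_input_types):
    """Returns names of columns of a given data type, excluding some input types."""
    return [
        tup[0] for tup in column_definition
        if tup[1] == data_type and tup[2] not in excluded_input_types
    ]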
Example 2
    def set_scalers(self, df):
        """Calibrates scalers using the data supplied.

        Args:
            df: Data to use to calibrate scalers.
        """
        print('Setting scalers with training data...')

        column_definitions = self.get_column_definition()
        id_column = get_single_col_by_input_type(InputTypes.ID,
                                                 column_definitions)
        target_column = get_single_col_by_input_type(InputTypes.TARGET,
                                                     column_definitions)

        # Format real scalers
        real_inputs = extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        print('Real Scalers')
        # Initialise scaler caches
        self._real_scalers = {}
        self._target_scaler = {}
        identifiers = []
        for identifier, sliced in df.groupby(id_column):
            print('{} - {}'.format(identifier, len(sliced)))
            # Only calibrate on entities whose trajectories are long enough
            # to yield at least one full window
            if len(sliced) >= self._time_steps:
                data = sliced[real_inputs].values
                targets = sliced[[target_column]].values
                self._real_scalers[identifier] = (
                    sklearn.preprocessing.StandardScaler().fit(data))
                self._target_scaler[identifier] = (
                    sklearn.preprocessing.StandardScaler().fit(targets))
                identifiers.append(identifier)

        # Format categorical scalers
        categorical_inputs = extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        print('Categorical Scalers')
        categorical_scalers = {}
        num_classes = []
        for col in categorical_inputs:
            print(col)
            # Casting to str (srs = df[col].astype(str)) would avoid mixed
            # integer/string columns; the cast is left disabled here
            srs = df[col]  # .astype(str)
            categorical_scalers[col] = sklearn.preprocessing.LabelEncoder(
            ).fit(srs.values)
            num_classes.append(srs.nunique())

        # Set categorical scaler outputs
        self._cat_scalers = categorical_scalers
        self._num_classes_per_cat_input = num_classes

        # Extract identifiers in case required
        self.identifiers = identifiers
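
As a self-contained illustration of the per-entity calibration above (the toy frame and column names are invented for the example):

import pandas as pd
import sklearn.preprocessing

# Toy frame: two entities with very different scales
df = pd.DataFrame({
    'id': ['a'] * 4 + ['b'] * 4,
    'power_usage': [1.0, 2.0, 3.0, 4.0, 10.0, 20.0, 30.0, 40.0],
})

real_scalers = {}
for identifier, sliced in df.groupby('id'):
    # One StandardScaler per entity, fitted only on that entity's rows
    real_scalers[identifier] = sklearn.preprocessing.StandardScaler().fit(
        sliced[['power_usage']].values)

# Each entity is normalised against its own statistics
print(real_scalers['a'].mean_, real_scalers['b'].mean_)  # [2.5] [25.]

Fitting one scaler per identifier is what lets entities with very different magnitudes (e.g. individual electricity customers) share a single model.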
Example 3
    def __init__(self):
        """Initialises formatter."""

        self.identifiers = None
        self._real_scalers = None
        self._cat_scalers = None
        self._target_scaler = None
        self._num_classes_per_cat_input = None
        fixed_params = self.get_fixed_params()
        self._time_steps = fixed_params['total_time_steps']
        self._num_encoder_steps = fixed_params['num_encoder_steps']
        # Extract relevant columns
        self._column_definitions = self.get_column_definition()
        self._id_col = get_single_col_by_input_type(InputTypes.ID,
                                                    self._column_definitions)
        self._target_column = get_single_col_by_input_type(
            InputTypes.TARGET, self._column_definitions)
        self._real_inputs = extract_cols_from_data_type(
            DataTypes.REAL_VALUED, self._column_definitions,
            {InputTypes.ID, InputTypes.TIME})
        self._categorical_inputs = extract_cols_from_data_type(
            DataTypes.CATEGORICAL, self._column_definitions,
            {InputTypes.ID, InputTypes.TIME})
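
Example 3 reads 'total_time_steps' and 'num_encoder_steps' from a get_fixed_params hook that subclasses are expected to provide. A hypothetical implementation (the values below are illustrative, not taken from the source) could be:

class FixedParamsHost:
    """Hypothetical host class for the hook read in Example 3's __init__."""

    def get_fixed_params(self):
        # Illustrative values: an hourly series with a week of encoder
        # context plus one day to forecast
        return {
            'total_time_steps': 8 * 24,
            'num_encoder_steps': 7 * 24,
        }


print(FixedParamsHost().get_fixed_params()['total_time_steps'])  # 192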
Example 4
    def transform_inputs(self, df):
        """Performs feature transformations.

        This includes feature engineering, preprocessing, and normalisation.

        Args:
            df: Data frame to transform.

        Returns:
            Transformed data frame.
        """
        print('Transforming data...')
        if self._real_scalers is None and self._cat_scalers is None:
            raise ValueError('Scalers have not been set!')

        # Extract relevant columns
        column_definitions = self.get_column_definition()
        id_col = get_single_col_by_input_type(InputTypes.ID,
                                              column_definitions)
        real_inputs = extract_cols_from_data_type(
            DataTypes.REAL_VALUED, column_definitions,
            {InputTypes.ID, InputTypes.TIME})
        categorical_inputs = extract_cols_from_data_type(
            DataTypes.CATEGORICAL, column_definitions,
            {InputTypes.ID, InputTypes.TIME})

        # Transform real inputs per entity
        df_list = []
        print('Real Features Transform')
        for identifier, sliced in df.groupby(id_col):
            print('{} - {}'.format(identifier, len(sliced)))
            # Filter out any trajectories that are too short
            if len(sliced) >= self._time_steps:

                sliced_copy = sliced.copy()
                sliced_copy[real_inputs] = self._real_scalers[
                    identifier].transform(sliced_copy[real_inputs].values)
                df_list.append(sliced_copy)

        output = pd.concat(df_list, axis=0)

        print('Categorical Features Transform')
        # Format categorical inputs. Casting to str (df[col].apply(str)) would
        # match the handling in set_scalers; the cast is left disabled here.
        # Note this assignment assumes the length filter above dropped no rows,
        # so the encoded array stays aligned with `output`.
        for col in categorical_inputs:
            print(col)
            string_df = df[col]  # .apply(str)
            output[col] = self._cat_scalers[col].transform(string_df)

        return output
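
Putting the examples together, a typical calibrate-then-transform flow looks like the sketch below; split_data and the ElectricityFormatter wiring are assumptions here, not code from the source.

# Hypothetical usage: calibrate scalers on the training split only, then
# apply the same scalers to every split so no statistics leak across splits
formatter = ElectricityFormatter(data_df)        # Example 1's constructor
train_df, valid_df = split_data(formatter.data)  # split_data is assumed, not shown

formatter.set_scalers(train_df)                  # Example 2: fit per-entity scalers
train = formatter.transform_inputs(train_df)     # Example 4: normalise features
valid = formatter.transform_inputs(valid_df)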