Example #1
0
    def deserialize(self, data: str) -> Any:
        """Parse a CSV payload string into per-row tabular model inputs.

        The CSV text is read into a DataFrame, imputed/normalized through
        ``_pre_transform`` using the stats stored on this deserializer
        (codes, mean, std, target handling), then split into stacked
        categorical and numerical arrays — one row per sample.
        """
        frame = pd.read_csv(StringIO(data))
        frame = _pre_transform(
            [frame],
            self.num_cols,
            self.cat_cols,
            self.codes,
            self.mean,
            self.std,
            self.target_col,
            self.target_codes,
        )[0]

        # Per-column 1-D arrays -> (num_samples, num_cols) via axis-1 stacking.
        cat_arr = np.stack(_to_cat_vars_numpy(frame, self.cat_cols), 1)
        num_arr = np.stack(_to_num_vars_numpy(frame, self.num_cols), 1)

        samples = []
        for cat_row, num_row in zip(cat_arr, num_arr):
            samples.append({DefaultDataKeys.INPUT: [cat_row, num_row]})
        return samples
Example #2
0
    def serve_load_sample(self, data: str) -> Any:
        """Convert a served CSV payload into per-row model inputs.

        Preprocessing uses the ``parameters`` captured at fit time:
        numerical/categorical field lists, category codes, and the
        normalization mean/std.
        """
        params = self._parameters

        frame = read_csv(StringIO(data))
        frame = _pre_transform(
            frame,
            params["numerical_fields"],
            params["categorical_fields"],
            params["codes"],
            params["mean"],
            params["std"],
        )

        # Stack the per-column arrays so each row is one sample.
        stacked_cat = np.stack(_to_cat_vars_numpy(frame, params["categorical_fields"]), 1)
        stacked_num = np.stack(_to_num_vars_numpy(frame, params["numerical_fields"]), 1)

        samples = []
        for cat_row, num_row in zip(stacked_cat, stacked_num):
            samples.append({DataKeys.INPUT: [cat_row, num_row]})
        return samples
Example #3
0
    def deserialize(self, data: str) -> Any:
        """Parse a newline/comma-delimited payload into model inputs.

        The first line supplies the column names; each following line (the
        trailing element after the final newline is dropped) is split on
        commas, its first field discarded, and the remainder converted via
        ``TabularDeserializer._convert_row`` before preprocessing.
        """
        lines = data.split("\n")
        columns = lines[0].split(",")

        rows = [
            TabularDeserializer._convert_row(line.split(",")[1:])
            for line in lines[1:-1]
        ]
        frame = pd.DataFrame(rows, columns=columns)

        frame = _pre_transform(
            [frame],
            self.num_cols,
            self.cat_cols,
            self.codes,
            self.mean,
            self.std,
            self.target_col,
            self.target_codes,
        )[0]

        # One sample per row after axis-1 stacking of the column arrays.
        cat_arr = np.stack(_to_cat_vars_numpy(frame, self.cat_cols), 1)
        num_arr = np.stack(_to_num_vars_numpy(frame, self.num_cols), 1)

        return [
            {DefaultDataKeys.INPUT: [cat_row, num_row]}
            for cat_row, num_row in zip(cat_arr, num_arr)
        ]
Example #4
0
    def preprocess(
        self,
        df: DataFrame,
        categorical_fields: Optional[List[str]] = None,
        numerical_fields: Optional[List[str]] = None,
        parameters: Dict[str, Any] = None,
    ):
        """Impute/normalize ``df`` and return stacked cat/num arrays.

        In training mode the field lists are sanitized and the preprocessing
        parameters (field lists, codes, mean, std) are computed from ``df``
        itself; in evaluation/inference mode ``parameters`` from the train
        run must be supplied.

        Returns:
            Tuple ``(cat_vars, num_vars)`` of shape ``(num_samples, n_cat)``
            and ``(num_samples, n_num)``; zero-width arrays when a field
            list is empty.

        Raises:
            MisconfigurationException: if not training and ``parameters``
                is ``None``.
        """
        if self.training:
            categorical_fields, numerical_fields = self._sanetize_fields(
                categorical_fields, numerical_fields)
            parameters = self.compute_parameters(df, numerical_fields,
                                                 categorical_fields)
        elif parameters is None:
            raise MisconfigurationException(
                "Loading tabular data for evaluation or inference requires parameters from the train data. Either "
                "construct the train data at the same time as evaluation and inference or provide the train "
                "`datamodule.parameters` in the `parameters` argument.")

        self.parameters = parameters

        # impute and normalize data
        df = _pre_transform(
            df,
            parameters["numerical_fields"],
            parameters["categorical_fields"],
            parameters["codes"],
            parameters["mean"],
            parameters["std"],
        )

        cat_columns = _to_cat_vars_numpy(df, parameters["categorical_fields"])
        num_columns = _to_num_vars_numpy(df, parameters["numerical_fields"])

        n_rows = len(df)
        # Stack per-column arrays row-wise; fall back to zero-width arrays so
        # downstream code always receives 2-D inputs even with no columns.
        if len(cat_columns):
            cat_vars = np.stack(cat_columns, 1)
        else:
            cat_vars = np.zeros((n_rows, 0), dtype=np.int64)
        if len(num_columns):
            num_vars = np.stack(num_columns, 1)
        else:
            num_vars = np.zeros((n_rows, 0), dtype=np.float32)

        return cat_vars, num_vars
Example #5
0
    def common_load_data(
        self,
        df: DataFrame,
        dataset: Optional[Any] = None,
    ):
        """Impute/normalize ``df`` and return it with stacked cat/num arrays.

        Applies ``_pre_transform`` with the stats stored on ``self`` (codes,
        mean, std, target handling), records the sample count on ``dataset``
        when one is provided, and converts the frame into axis-1-stacked
        categorical and numerical numpy arrays.

        Returns:
            Tuple ``(df, cat_vars, num_vars)``; the arrays have shape
            ``(num_samples, n_cols)``, with zero width when the
            corresponding column list is empty.
        """
        # impute_data
        # compute train dataset stats
        dfs = _pre_transform(
            [df], self.num_cols, self.cat_cols, self.codes, self.mean, self.std, self.target_col, self.target_codes
        )

        df = dfs[0]

        num_samples = len(df)
        if dataset is not None:
            dataset.num_samples = num_samples

        cat_vars = _to_cat_vars_numpy(df, self.cat_cols)
        num_vars = _to_num_vars_numpy(df, self.num_cols)

        # Fix: np.stack raises ValueError on an empty sequence, so guard with a
        # zero-width fallback (matching `preprocess`). The previously
        # commented-out guard also used the wrong length (`len(self)` instead
        # of the row count).
        cat_vars = np.stack(cat_vars, 1) if len(cat_vars) else np.zeros((num_samples, 0), dtype=np.int64)
        num_vars = np.stack(num_vars, 1) if len(num_vars) else np.zeros((num_samples, 0), dtype=np.float32)
        return df, cat_vars, num_vars