Beispiel #1
0
    def compute(X, y, machinelearning_task):

        columns_info = {}
        for col in X.columns:
            columns_info[col] = []
            #
            empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
            if empty_column:
                columns_info[col] += ["empty_column"]
                continue
            #
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]),
                                                  col])) == 1
            if constant_column:
                columns_info[col] += ["constant_column"]
                continue
            #
            if PreprocessingUtils.is_na(X[col]):
                columns_info[col] += ["missing_values"]
            #
            if PreprocessingUtils.is_categorical(X[col]):
                columns_info[col] += ["categorical"]
                columns_info[col] += [EncodingSelector.get(X, y, col)]
            elif PreprocessingUtils.is_datetime(X[col]):
                columns_info[col] += ["datetime_transform"]
            elif PreprocessingUtils.is_text(X[col]):
                columns_info[col] = ["text_transform"
                                     ]  # override other transforms
            else:
                # numeric type, check if scale needed
                if PreprocessingUtils.is_scale_needed(X[col]):
                    columns_info[col] += ["scale"]

        target_info = []
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_info += ["convert_0_1"]

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_info += ["scale_log"]
            elif PreprocessingUtils.is_scale_needed(y):
                target_info += ["scale"]

        num_class = None
        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            num_class = PreprocessingUtils.num_class(y)

        return {
            "columns_info": columns_info,
            "target_info": target_info,
            "num_class": num_class,
        }
Beispiel #2
0
    def get(required_preprocessing, data, machinelearning_task):

        X = data["train"]["X"]
        y = data["train"]["y"]

        columns_preprocessing = {}
        for col in X.columns:
            preprocessing_to_apply = []

            # remove empty columns and columns with only one variable
            empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if empty_column or constant_column:
                preprocessing_to_apply += ["remove_column"]
                columns_preprocessing[col] = preprocessing_to_apply
                continue

            # always check for missing values
            if (
                "missing_values_inputation" in required_preprocessing
                and PreprocessingUtils.is_na(X[col])
            ):
                preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
            # convert to categorical only for categorical types
            convert_to_integer_will_be_applied = False
            if (
                "convert_categorical" in required_preprocessing
                and PreprocessingUtils.is_categorical(X[col])
            ):
                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                convert_to_integer_will_be_applied = True

            if "scale" in required_preprocessing:
                if convert_to_integer_will_be_applied:
                    preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]
                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
                #    preprocessing_to_apply += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
                elif PreprocessingUtils.is_scale_needed(X[col]):
                    preprocessing_to_apply += [PreprocessingScale.SCALE_NORMAL]

            # remeber which preprocessing we need to apply
            if preprocessing_to_apply:
                columns_preprocessing[col] = preprocessing_to_apply

        target_preprocessing = []
        # always remove missing values from target,
        # missing values might be in train and in validation datasets
        target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            if PreprocessingUtils.is_categorical(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == REGRESSION:
            if PreprocessingUtils.is_log_scale_needed(y):
                target_preprocessing += [PreprocessingScale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(y):
                target_preprocessing += [PreprocessingScale.SCALE_NORMAL]

        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
        }
Beispiel #3
0
    def get(required_preprocessing, data, machinelearning_task):

        X = data["train"]["X"]
        y = data["train"]["y"]

        columns_preprocessing = {}
        for col in X.columns:
            preprocessing_to_apply = []

            # remove empty columns and columns with only one variable
            empty_column = np.sum(pd.isnull(X[col]) == True) == X.shape[0]
            constant_column = len(np.unique(X.loc[~pd.isnull(X[col]), col])) == 1
            if empty_column or constant_column:
                preprocessing_to_apply += ["remove_column"]
                columns_preprocessing[col] = preprocessing_to_apply
                continue

            # always check for missing values
            if (
                "missing_values_inputation" in required_preprocessing
                and PreprocessingUtils.is_na(X[col])
            ):
                preprocessing_to_apply += [PreprocessingMissingValues.FILL_NA_MEDIAN]
            # convert to categorical only for categorical types
            convert_to_integer_will_be_applied = False
            if (
                "convert_categorical" in required_preprocessing
                and PreprocessingUtils.is_categorical(X[col])
            ):
                preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                convert_to_integer_will_be_applied = True

            if "scale" in required_preprocessing:
                if convert_to_integer_will_be_applied:
                    preprocessing_to_apply += [Scale.SCALE_NORMAL]
                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
                #    preprocessing_to_apply += [Scale.SCALE_LOG_AND_NORMAL]
                elif PreprocessingUtils.is_scale_needed(X[col]):
                    preprocessing_to_apply += [Scale.SCALE_NORMAL]

            # remeber which preprocessing we need to apply
            if preprocessing_to_apply:
                columns_preprocessing[col] = preprocessing_to_apply

        target_preprocessing = []
        # always remove missing values from target,
        # target with missing values might be in the train and in the validation datasets
        target_preprocessing += [PreprocessingMissingValues.NA_EXCLUDE]

        if "target_as_integer" in required_preprocessing:
            if machinelearning_task == BINARY_CLASSIFICATION:
                if not PreprocessingUtils.is_0_1(y):
                    target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

            if machinelearning_task == MULTICLASS_CLASSIFICATION:
                # if PreprocessingUtils.is_categorical(y):
                # always convert to integer, there can be many situations that can break
                # for example, classes starting from 1, ...
                # or classes not for every number, for example 0,2,3,4
                # just always convert
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        elif "target_as_one_hot" in required_preprocessing:
            target_preprocessing += [PreprocessingCategorical.CONVERT_ONE_HOT]

        if (
            machinelearning_task == REGRESSION
            and "target_scale" in required_preprocessing
        ):
            if PreprocessingUtils.is_log_scale_needed(y):
                target_preprocessing += [Scale.SCALE_LOG_AND_NORMAL]
            elif PreprocessingUtils.is_scale_needed(y):
                target_preprocessing += [Scale.SCALE_NORMAL]

        """    
        if machinelearning_task == BINARY_CLASSIFICATION:
            if not PreprocessingUtils.is_0_1(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        if machinelearning_task == MULTICLASS_CLASSIFICATION:
            if PreprocessingUtils.is_categorical(y):
                target_preprocessing += [PreprocessingCategorical.CONVERT_INTEGER]

        """
        return {
            "columns_preprocessing": columns_preprocessing,
            "target_preprocessing": target_preprocessing,
            "ml_task": machinelearning_task,
        }