Example #1
    def normalize(self, columns):
        """
        Normalize numeric columns (of scalar, array or vector type) to the range [0, 1] using min-max scaling.
        :param columns: list of column names
        :return: FeatureTable
        """
        df = self.df
        types = [x[1] for x in self.df.select(*columns).dtypes]
        scalar_cols = [
            columns[i] for i in range(len(columns))
            if types[i] == "int" or types[i] == "bigint" or types[i] == "float"
            or types[i] == "double"
        ]
        array_cols = [
            columns[i] for i in range(len(columns))
            if types[i] == "array<int>" or types[i] == "array<bigint>"
            or types[i] == "array<float>" or types[i] == "array<double>"
        ]
        vector_cols = [
            columns[i] for i in range(len(columns)) if types[i] == "vector"
        ]
        if scalar_cols:
            assembler = VectorAssembler(inputCols=scalar_cols,
                                        outputCol="vect")

            # MinMaxScaler Transformation
            scaler = MinMaxScaler(inputCol="vect", outputCol="scaled")

            # Pipeline of VectorAssembler and MinMaxScaler
            pipeline = Pipeline(stages=[assembler, scaler])

            tolist = udf(lambda x: x.toArray().tolist(),
                         ArrayType(DoubleType()))

            # Fitting pipeline on dataframe
            df = pipeline.fit(df).transform(df) \
                .withColumn("scaled_list", tolist(pyspark_col("scaled"))) \
                .drop("vect").drop("scaled")
            for i in range(len(scalar_cols)):
                df = df.withColumn(scalar_cols[i],
                                   pyspark_col("scaled_list")[i])
            df = df.drop("scaled_list")

            # cast to float
            for c in scalar_cols:
                df = df.withColumn(c, pyspark_col(c).cast("float"))

        for c in array_cols:
            df = normalize_array(df, c)

        for c in vector_cols:
            scaler = MinMaxScaler(inputCol=c, outputCol="scaled")
            df = scaler.fit(df).transform(df).withColumnRenamed("scaled", c)

        return FeatureTable(df)
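
A minimal usage sketch, assuming an existing SparkSession named `spark` and that `FeatureTable` wraps a Spark DataFrame (the column names here are hypothetical):

    df = spark.createDataFrame([(1, 10.0), (2, 30.0), (3, 50.0)], ["id", "price"])
    tbl = FeatureTable(df)
    # "price" is min-max scaled into [0, 1] and cast to float; "id" is not listed, so it is left untouched
    scaled_tbl = tbl.normalize(["price"])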
Example #2
    def add_feature(self, item_cols, feature_tbl, default_value):
        """
        Get the category or another field from a map-like FeatureTable

        :param item_cols: list[string]
        :param feature_tbl: FeatureTable with two columns [category, item]
        :param default_value: default value for the category if the item key does not exist

        :return: FeatureTable
        """
        item2cat_map = dict(feature_tbl.df.distinct().rdd.map(lambda row: (row[0], row[1]))
                            .collect())

        def gen_cat(items):
            getcat = lambda item: item2cat_map.get(item, default_value)
            if isinstance(items, int):
                cats = getcat(items)
            elif isinstance(items, list) and isinstance(items[0], int):
                cats = [getcat(item) for item in items]
            elif isinstance(items, list) and isinstance(items[0], list) and isinstance(items[0][0],
                                                                                       int):
                cats = []
                for line in items:
                    line_cats = [getcat(item) for item in line]
                    cats.append(line_cats)
            else:
                raise ValueError('only int, list[int], and list[list[int]] are supported.')
            return cats

        df = self.df
        for c in item_cols:
            col_type = df.schema[c].dataType
            cat_udf = udf(gen_cat, col_type)
            df = df.withColumn(c.replace("item", "category"), cat_udf(pyspark_col(c)))
        return FeatureTable(df)
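
A minimal, hypothetical sketch. Note that the lookup is built as row[0] -> row[1], so the first column of `feature_tbl` is used as the key (the item) and the second as the value (the category):

    items = FeatureTable(spark.createDataFrame([(1, 100), (2, 200), (3, 300)], ["user", "item"]))
    lookup = FeatureTable(spark.createDataFrame([(100, 7), (200, 8)], ["item", "category"]))
    # adds a "category" column: 100 -> 7, 200 -> 8, and 300 falls back to the default value 0
    with_cat = items.add_feature(["item"], lookup, default_value=0)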
Example #3
    def cast(self, columns, type):
        """
        Cast columns to the specified type.

        :param columns: a string or a list of strings that specifies column names.
                        If it is None, then cast all of the columns.
        :param type: a string ("string", "boolean", "int", "long", "short", "float", "double")
                     that specifies the type.

        :return: A new Table that casts all of the specified columns to the specified type.
        """
        if columns is None:
            columns = self.df.columns
        elif not isinstance(columns, list):
            columns = [columns]
        check_col_exists(self.df, columns)
        valid_types = ["str", "string", "bool", "boolean", "int",
                       "integer", "long", "short", "float", "double"]
        if not (isinstance(type, str) and (type in valid_types)) \
           and not isinstance(type, DataType):
            raise ValueError(
                "type should be string, boolean, int, long, short, float, double.")
        transform_dict = {"str": "string", "bool": "boolean", "integer": "int"}
        type = transform_dict[type] if type in transform_dict else type
        df_cast = self._clone(self.df)
        for i in columns:
            df_cast.df = df_cast.df.withColumn(i, pyspark_col(i).cast(type))
        return df_cast
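
A minimal usage sketch (column names are hypothetical); `columns` may be a single name, a list of names, or None to cast every column:

    tbl2 = tbl.cast("id", "float")           # cast one column
    tbl3 = tbl.cast(["id", "price"], "str")  # "str" is normalized to "string"
    tbl4 = tbl.cast(None, "double")          # cast all columns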
Example #4
    def transform_python_udf(self, in_col, out_col, udf_func):
        """
        Transform a FeatureTable using a python udf

        :param in_col: string, name of column needed to be transformed.
        :param out_col: string, output column.
        :param udf_func: user-defined Python function wrapped as a PySpark udf

        :return: FeatureTable
        """
        df = self.df.withColumn(out_col, udf_func(pyspark_col(in_col)))
        return FeatureTable(df)
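
A minimal sketch; the Python function must be wrapped as a PySpark udf before it is passed in (column names are hypothetical):

    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType

    upper_udf = udf(lambda s: s.upper() if s is not None else None, StringType())
    # adds a "name_upper" column computed from "name"
    tbl2 = tbl.transform_python_udf("name", "name_upper", upper_udf)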
Example #5
    def add(self, columns, value=1):
        """
        Increase all values of the target numeric column(s) by a constant value.

        :param columns: str or list of str, the target columns to be increased.
        :param value: numeric (int/float/double/short/long), the constant value to be added.

        :return: A new Table with updated numeric values on specified columns.
        """
        if columns is None:
            raise ValueError("Columns should be str or list of str, but got None")
        if not isinstance(columns, list):
            columns = [columns]
        check_col_exists(self.df, columns)
        new_df = self.df
        for column in columns:
            if new_df.schema[column].dataType not in [IntegerType(), ShortType(),
                                                      LongType(), FloatType(),
                                                      DecimalType(), DoubleType()]:
                raise ValueError("Column type should be numeric, but have type {} \
                    for column {}".format(new_df.schema[column].dataType, column))
            new_df = new_df.withColumn(column, pyspark_col(column) + lit(value))
        return self._clone(new_df)
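
A minimal usage sketch (column names are hypothetical):

    tbl2 = tbl.add("clicks")                 # "clicks" increased by the default value 1
    tbl3 = tbl.add(["clicks", "price"], 10)  # both columns increased by 10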
Example #6
    def col(self, name):
        return pyspark_col(name)
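
A minimal sketch: this is a thin wrapper that returns a pyspark.sql.Column for the given name (the column name below is hypothetical):

    price_col = tbl.col("price")   # equivalent to pyspark_col("price")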