def normalize(self, columns):
    """
    Normalize numeric columns (scalar, array and vector columns) into [0, 1] using MinMaxScaler.

    :param columns: list of column names.

    :return: A new FeatureTable with the specified columns normalized.
    """
    df = self.df
    types = [x[1] for x in self.df.select(*columns).dtypes]
    scalar_cols = [columns[i] for i in range(len(columns))
                   if types[i] in ("int", "bigint", "float", "double")]
    array_cols = [columns[i] for i in range(len(columns))
                  if types[i] in ("array<int>", "array<bigint>",
                                  "array<float>", "array<double>")]
    vector_cols = [columns[i] for i in range(len(columns)) if types[i] == "vector"]
    if scalar_cols:
        # Assemble all scalar columns into a single vector column.
        assembler = VectorAssembler(inputCols=scalar_cols, outputCol="vect")
        # MinMaxScaler transformation on the assembled vector.
        scaler = MinMaxScaler(inputCol="vect", outputCol="scaled")
        # Pipeline of VectorAssembler and MinMaxScaler.
        pipeline = Pipeline(stages=[assembler, scaler])
        tolist = udf(lambda x: x.toArray().tolist(), ArrayType(DoubleType()))
        # Fit the pipeline on the DataFrame and split the scaled vector
        # back into the original scalar columns.
        df = pipeline.fit(df).transform(df) \
            .withColumn("scaled_list", tolist(pyspark_col("scaled"))) \
            .drop("vect").drop("scaled")
        for i in range(len(scalar_cols)):
            df = df.withColumn(scalar_cols[i], pyspark_col("scaled_list")[i])
        df = df.drop("scaled_list")
        # Cast the normalized values to float.
        for c in scalar_cols:
            df = df.withColumn(c, pyspark_col(c).cast("float"))
    for c in array_cols:
        df = normalize_array(df, c)
    for c in vector_cols:
        scaler = MinMaxScaler(inputCol=c, outputCol="scaled")
        # Drop the original column before renaming to avoid duplicate column names.
        df = scaler.fit(df).transform(df).drop(c).withColumnRenamed("scaled", c)
    return FeatureTable(df)
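# Usage sketch (illustrative only; the column names and the SparkSession `spark`
# are assumptions, not part of the source): normalize rescales the listed numeric
# columns into [0, 1].
#
#     df = spark.createDataFrame([(10, 1.0), (20, 3.0), (30, 5.0)],
#                                ["clicks", "price"])
#     tbl = FeatureTable(df)
#     normed = tbl.normalize(["clicks", "price"])
#     normed.df.show()  # both columns now hold float values in [0, 1]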
def add_feature(self, item_cols, feature_tbl, default_value):
    """
    Look up the category (or another feature) for each item from a mapping FeatureTable,
    and add it as a new column named after the item column with "item" replaced by "category".

    :param item_cols: list[string], names of the item columns to look up.
    :param feature_tbl: a FeatureTable with two columns [item, category], where the first
           column (item) is the lookup key and the second (category) is the value to attach.
    :param default_value: the default value to use when a key does not exist in feature_tbl.

    :return: A new FeatureTable with an additional category column for each item column.
    """
    item2cat_map = dict(feature_tbl.df.distinct().rdd
                        .map(lambda row: (row[0], row[1])).collect())

    def gen_cat(items):
        getcat = lambda item: item2cat_map.get(item, default_value)
        if isinstance(items, int):
            cats = getcat(items)
        elif isinstance(items, list) and isinstance(items[0], int):
            cats = [getcat(item) for item in items]
        elif isinstance(items, list) and isinstance(items[0], list) \
                and isinstance(items[0][0], int):
            cats = []
            for line in items:
                line_cats = [getcat(item) for item in line]
                cats.append(line_cats)
        else:
            raise ValueError("Only int, list[int] and list[list[int]] are supported.")
        return cats

    df = self.df
    for c in item_cols:
        # The output column keeps the same type as the input column.
        col_type = df.schema[c].dataType
        cat_udf = udf(gen_cat, col_type)
        df = df.withColumn(c.replace("item", "category"), cat_udf(pyspark_col(c)))
    return FeatureTable(df)
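# Usage sketch (table contents and names are illustrative assumptions):
# feature_tbl maps item id -> category; each "item" column gets a matching
# "category" column, with default_value used for ids missing from the map.
#
#     item2cat = FeatureTable(spark.createDataFrame(
#         [(1, 100), (2, 200)], ["item", "category"]))
#     tbl = FeatureTable(spark.createDataFrame([(1,), (3,)], ["item"]))
#     out = tbl.add_feature(["item"], item2cat, default_value=0)
#     out.df.show()  # item 1 -> category 100, unseen item 3 -> category 0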
def cast(self, columns, type):
    """
    Cast columns to the specified type.

    :param columns: a string or a list of strings that specifies column names.
           If it is None, then cast all of the columns.
    :param type: a string ("string", "boolean", "int", "long", "short", "float", "double")
           or a pyspark.sql.types.DataType that specifies the target type.

    :return: A new Table that casts all of the specified columns to the specified type.
    """
    if columns is None:
        columns = self.df.columns
    elif not isinstance(columns, list):
        columns = [columns]
    check_col_exists(self.df, columns)
    valid_types = ["str", "string", "bool", "boolean", "int", "integer",
                   "long", "short", "float", "double"]
    if not (isinstance(type, str) and (type in valid_types)) \
            and not isinstance(type, DataType):
        raise ValueError(
            "type should be one of string, boolean, int, long, short, float, double, "
            "or a pyspark DataType, but got {}".format(type))
    # Map aliases to the canonical Spark SQL type names.
    transform_dict = {"str": "string", "bool": "boolean", "integer": "int"}
    type = transform_dict[type] if type in transform_dict else type
    df_cast = self._clone(self.df)
    for i in columns:
        df_cast.df = df_cast.df.withColumn(i, pyspark_col(i).cast(type))
    return df_cast
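# Usage sketch (assumes a table `tbl` with an integer "age" column; names are
# illustrative):
#
#     doubles = tbl.cast("age", "double")   # cast a single column
#     strings = tbl.cast(None, "string")    # None casts every column
#     doubles.df.printSchema()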
def transform_python_udf(self, in_col, out_col, udf_func):
    """
    Transform a FeatureTable using a Python UDF.

    :param in_col: string, the name of the column to be transformed.
    :param out_col: string, the name of the output column.
    :param udf_func: a user-defined function (e.g. created with pyspark.sql.functions.udf)
           that maps the input column to the output column.

    :return: A new FeatureTable with the output column.
    """
    df = self.df.withColumn(out_col, udf_func(pyspark_col(in_col)))
    return FeatureTable(df)
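# Usage sketch: the function passed in is expected to already be wrapped as a
# Spark UDF (the column names here are assumptions).
#
#     from pyspark.sql.functions import udf
#     from pyspark.sql.types import IntegerType
#
#     double_clicks = udf(lambda x: x * 2, IntegerType())
#     out = tbl.transform_python_udf("clicks", "clicks_x2", double_clicks)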
def add(self, columns, value=1):
    """
    Increase all values of the target numeric column(s) by a constant value.

    :param columns: str or list of str, the target columns to be increased.
    :param value: numeric (int/float/double/short/long), the constant value to be added.

    :return: A new Table with updated numeric values on the specified columns.
    """
    if columns is None:
        raise ValueError("columns should be str or a list of str, but got None")
    if not isinstance(columns, list):
        columns = [columns]
    check_col_exists(self.df, columns)
    new_df = self.df
    for column in columns:
        if new_df.schema[column].dataType not in [IntegerType(), ShortType(), LongType(),
                                                  FloatType(), DecimalType(), DoubleType()]:
            raise ValueError("Column type should be numeric, but got type {} "
                             "for column {}".format(new_df.schema[column].dataType, column))
        new_df = new_df.withColumn(column, pyspark_col(column) + lit(value))
    return self._clone(new_df)
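# Usage sketch (column names are illustrative): shift label-encoded ids to be
# 1-based, or apply any constant offset.
#
#     shifted = tbl.add(["user_id", "item_id"])   # default value=1
#     adjusted = tbl.add("rating", value=-1)      # the offset may be negative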
def col(self, name):
    """
    Return a pyspark.sql.Column for the given column name.
    """
    return pyspark_col(name)
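# Usage sketch: a thin wrapper around pyspark.sql.functions.col, handy when
# building expressions against the underlying DataFrame, e.g.:
#
#     adults = tbl.df.filter(tbl.col("age") > 18)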