from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType


def scaling(dataFrame, inputColName, Min, Max):
    outputColName = "scaled " + inputColName
    # MinMaxScaler operates on vector columns, so wrap the input column first
    assembler = VectorAssembler(inputCols=[inputColName],
                                outputCol="features")
    assembledDF = assembler.transform(dataFrame)
    scaler = MinMaxScaler(inputCol="features",
                          outputCol=outputColName)
    scaler.setMax(Max).setMin(Min)
    scalerModel = scaler.fit(assembledDF)
    scaledDF = scalerModel.transform(assembledDF).drop("features")
    # The scaler emits a one-element vector; unpack it back into a float column
    castVectorToFloat = udf(lambda v: float(v[0]), FloatType())
    scaledDF = scaledDF.withColumn(outputColName, castVectorToFloat(outputColName))
    print("Successfully scaled the column '{0:s}' to the range ({1:f}, {2:f}) and created a new column '{3:s}'."
          .format(inputColName, scaler.getMin(), scaler.getMax(), outputColName))
    return scaledDF
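
# Usage sketch for scaling() above (hypothetical data, assuming an active
# SparkSession named `spark`):
age_df = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ["age"])
scaled_age_df = scaling(age_df, "age", 0.0, 1.0)  # adds a 'scaled age' column
scaled_age_df.show()  # 1.0 -> 0.0, 2.0 -> 0.5, 3.0 -> 1.0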
class mmscaler_wrapper():
    """Wrap MinMaxScaler and remember the original column range so that
    scaled values can be mapped back to the original scale."""

    def __init__(self, inputCol, outputCol, s_min=0, s_max=1):
        self.mmModel = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
        self.mmModel.setMin(s_min)
        self.mmModel.setMax(s_max)
        self.in_column = inputCol
        self.originalMin = None
        self.originalMax = None

    def get_input_col_name(self):
        return self.mmModel.getInputCol()

    def getMax(self):
        return self.mmModel.getMax()

    def getMin(self):
        return self.mmModel.getMin()

    def describe(self):
        print('input column: {}, scaled range: [{}, {}], original range: [{}, {}]'
              .format(self.get_input_col_name(), self.getMin(), self.getMax(),
                      self.originalMin, self.originalMax))

    def fit(self, df):
        # record the original range of the (vector) input column so the
        # scaling can be inverted later
        col = self.mmModel.getInputCol()
        self.originalMin = df.select(col).rdd.flatMap(lambda x: x[0]).min()
        self.originalMax = df.select(col).rdd.flatMap(lambda x: x[0]).max()
        return self.mmModel.fit(df)

    # denormalize the value: the inverse of
    # scaled = (x - originalMin) / (originalMax - originalMin) * (sMax - sMin) + sMin
    def denormalize(self, value):
        v = (value - self.getMin()) / (self.getMax() - self.getMin()) * (
            self.originalMax - self.originalMin) + self.originalMin
        if v == v:  # v == v is False only for NaN
            return v
        else:
            return -999

    def denormalize_df(self, df):
        # sketch: map the scaled output column back to the original range;
        # assumes one-element feature vectors, as in scaling() above
        col = self.mmModel.getOutputCol()
        s_min, s_max = self.getMin(), self.getMax()
        o_min, o_max = self.originalMin, self.originalMax
        denorm = udf(lambda v: float((v[0] - s_min) / (s_max - s_min)
                                     * (o_max - o_min) + o_min), FloatType())
        return df.withColumn(col, denorm(col))

    def normalize(self, value):
        # forward transform for a single raw value, the counterpart of
        # denormalize(); requires fit() to have been called first
        return ((value - self.originalMin) / (self.originalMax - self.originalMin)
                * (self.getMax() - self.getMin()) + self.getMin())
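
# Usage sketch for mmscaler_wrapper (hypothetical data, assuming an active
# SparkSession named `spark`). MinMaxScaler operates on vector columns, so
# each value is wrapped in a one-element dense vector.
from pyspark.ml.linalg import Vectors

demo_df = spark.createDataFrame(
    [(Vectors.dense([10.0]),), (Vectors.dense([20.0]),), (Vectors.dense([30.0]),)],
    ["price"])
wrapper = mmscaler_wrapper(inputCol="price", outputCol="scaled_price")
model = wrapper.fit(demo_df)        # records originalMin=10.0, originalMax=30.0
model.transform(demo_df).show()     # 10 -> 0.0, 20 -> 0.5, 30 -> 1.0
print(wrapper.denormalize(0.5))     # 20.0, back on the original scale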

###MinMaxScaler (0, 1)
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -1.0]),
), (
    1,
    Vectors.dense([2.0, 1.1, 1.0]),
), (
    2,
    Vectors.dense([3.0, 10.1, 3.0]),
)], ["id", "features"])

scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MinMaxScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [min, max].
scaledData = scalerModel.transform(dataFrame)
print("Features scaled to range: [%f, %f]" %
      (scaler.getMin(), scaler.getMax()))
scaledData.select("features", "scaledFeatures").show()
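
# Each feature is rescaled independently with
# x' = (x - col_min) / (col_max - col_min), so for this data the expected
# scaledFeatures are:
#   [1.0,  0.1, -1.0] -> [0.0, 0.0, 0.0]
#   [2.0,  1.1,  1.0] -> [0.5, 0.1, 0.5]
#   [3.0, 10.1,  3.0] -> [1.0, 1.0, 1.0]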

# COMMAND ----------

###MaxAbsScaler (-1, 1)
from pyspark.ml.feature import MaxAbsScaler
from pyspark.ml.linalg import Vectors

dataFrame = spark.createDataFrame([(
    0,
    Vectors.dense([1.0, 0.1, -8.0]),
), (
    1,
    Vectors.dense([2.0, 1.0, -4.0]),
), (
    2,
    Vectors.dense([4.0, 10.0, 8.0]),
)], ["id", "features"])

scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# Compute summary statistics and generate MaxAbsScalerModel
scalerModel = scaler.fit(dataFrame)

# rescale each feature to range [-1, 1]
scaledData = scalerModel.transform(dataFrame)
scaledData.select("features", "scaledFeatures").show()
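
# MaxAbsScaler divides each feature by the largest absolute value in its
# column, preserving sign and sparsity, so the expected scaledFeatures are:
#   [1.0,  0.1, -8.0] -> [0.25, 0.01, -1.0]
#   [2.0,  1.0, -4.0] -> [0.5,  0.1,  -0.5]
#   [4.0, 10.0,  8.0] -> [1.0,  1.0,   1.0]

#####################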
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

    # $example on$
    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -1.0]),),
        (1, Vectors.dense([2.0, 1.1, 1.0]),),
        (2, Vectors.dense([3.0, 10.1, 3.0]),)
    ], ["id", "features"])

    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()
    # $example off$

    spark.stop()
#####################

########################
## RESCALING DATA SET ##
########################
# Neural networks typically perform better when the data has been
# preprocessed, so I scaled the feature space to min = 0 and max = 1.

scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')

scalerModel = scaler.fit(df)

scaledData = scalerModel.transform(df)

print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))

scaledData.select("features", "scaledFeatures").show()

new_df = scaledData.selectExpr("label", "radius_mean", "texture_mean", 
	"perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean",
	 "concavity_mean", "concave_points_mean", "symmetry_mean", 
	 "fractal_dimension_mean", "radius_se", "texture_se", "perimeter_se", 
	 "area_se", "smoothness_se", "compactness_se", "concavity_se", 
	 "concave_points_se", "symmetry_se", "fractal_dimension_se", 
	 "radius_worst", "texture_worst", "perimeter_worst", 
	 "area_worst", "smoothness_worst", "compactness_worst", 
	 "concavity_worst", "concave_points_worst", "symmetry_worst", 
	 "fractal_dimension_worst","features as oldFeature", 
	 "scaledFeatures as features")
# Example #6
####################################################################################
## part 2
print('*' * 100)
print('Part 2 - Normalize features between 0 and 1\n')

# assemble features values into a vector and create a feature containing those vectors
assembler = VectorAssembler().setInputCols(
    data.columns[1:]).setOutputCol('features')
transformed = assembler.transform(data)

# create scaler object, transform feature vectors and add scaledFeatures column
scaler = MinMaxScaler(inputCol='features', outputCol='scaledFeatures')
scalerModel = scaler.fit(transformed.select('features'))
scaledData = scalerModel.transform(transformed)

print('Features scaled to range: {} to {}'.format(scaler.getMin(),
                                                  scaler.getMax()))
# print(scaledData.select('_c0','features','scaledFeatures').show(10))

# limit dataset to label and scaled vectors
scaledData = scaledData.select('_c0', 'scaledFeatures')

# rename columns
scaledData = scaledData.withColumnRenamed('_c0', 'label').withColumnRenamed(
    'scaledFeatures', 'features')
scaledData.show(5)  # show() prints the rows itself; no need to wrap it in print()

####################################################################################
## part 3
print('*' * 100)
print('Part 3 - \n')