Exemple #1
0
    def minmax_scale(self, columns='*', Min=0.0, Max=1.0):
        '''
        Rescale columns to the range [Min, Max], adding one column per input.

        Each value is mapped with
        Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (Max - Min) + Min
        and written to a new column named "<column>_scaled". The input
        columns are kept, and self._df is replaced by the extended DataFrame.

        columns: list of column names, or "*" for every column in the schema.
        Min:     lower bound of the target range (int or float).
        Max:     upper bound of the target range (int or float).

        Returns the updated DataFrame (the same object stored in self._df).
        Raises AssertionError if columns is neither "*" nor a list, or if
        Min / Max is not numerical.
        '''
        if columns == "*":
            columns = self._df.schema.names
        else:
            assert isinstance(columns,
                              list), "Error: columns argument must be a list!"

        assert isinstance(Min, (float, int)), "Error: Min must be numerical"
        assert isinstance(Max, (float, int)), "Error: Max must be numerical"

        # Local import: keeps this method independent of the module's own
        # import list.
        from pyspark.sql.types import DoubleType

        # The scaler emits a one-element vector per row; unwrap it to a
        # scalar. An explicit return type is required because udf() defaults
        # to a string column, which would turn the scaled values into text.
        to_float = udf(lambda x: float(x[0]), DoubleType())

        for column in columns:
            outputcol = column + '_scaled'
            # The scaler is fed a temporary one-element vector column built
            # from the current input column.
            assembler = VectorAssembler(inputCols=[column],
                                        outputCol='features')
            df = assembler.transform(self._df)
            scaler = MinMaxScaler(inputCol='features', outputCol=outputcol)
            scaler.setMax(Max).setMin(Min)
            # Drop the temporary vector column once the scaled one exists.
            df = scaler.fit(df).transform(df).drop('features')
            self._df = df.withColumn(outputcol, to_float(outputcol))
        return self._df
Exemple #2
0
def min_max_scale(dataFrame, inputColNames, Min=0.0, Max=1.0):
    """Min-max scale the assembled feature vector of ``dataFrame``.

    ``getAssembledDataFrame`` is called with ``dataFrame`` and
    ``inputColNames``; its result is expected to carry a "features" column,
    which is rescaled to the range [Min, Max] and written to a new
    "scaled features" column. The intermediate "features" column is dropped
    from the returned DataFrame.
    """
    withFeatures = getAssembledDataFrame(dataFrame, inputColNames)

    mmScaler = MinMaxScaler(inputCol="features", outputCol="scaled features")
    mmScaler.setMax(Max)
    mmScaler.setMin(Min)

    fitted = mmScaler.fit(withFeatures)
    return fitted.transform(withFeatures).drop("features")
 def scaling(dataFrame, inputColName, Min, Max):
     """Min-max scale one column into a new float column.

     The column ``inputColName`` is rescaled to the range [Min, Max] and
     stored as a plain float in a new column named "scaled <inputColName>".
     A temporary "features" vector column is created for the scaler and
     dropped again before returning. A confirmation message is printed.

     Returns the DataFrame with the extra scaled column.
     """
     scaledColName = "scaled " + inputColName

     # Wrap the input column in a temporary one-element vector column for
     # the scaler to consume.
     vectorized = VectorAssembler(inputCols=[inputColName],
                                  outputCol="features").transform(dataFrame)

     mmScaler = MinMaxScaler(inputCol="features", outputCol=scaledColName)
     mmScaler.setMax(Max)
     mmScaler.setMin(Min)

     fitted = mmScaler.fit(vectorized)
     result = fitted.transform(vectorized).drop("features")

     # The scaler output is a one-element vector; unwrap it to a float.
     firstElementAsFloat = udf(lambda v: float(v[0]), FloatType())
     result = result.withColumn(scaledColName,
                                firstElementAsFloat(scaledColName))

     message = "Successfully scale the column '{0:s}' to range ({1:f}, {2:f}) and create a new column '{3:s}'."
     print (message.format(inputColName, mmScaler.getMin(),
                           mmScaler.getMax(), scaledColName))
     return result
class mmscaler_wrapper():
    """Wrap a MinMaxScaler and remember the input's original value range.

    fit() records the smallest and largest value found in the input column
    so that denormalize() can map a scaled value back to the original scale.
    """

    # Placeholders; replaced by __init__ (mmModel) and fit() (originalMin,
    # originalMax).
    mmModel = ''
    originalMin = ''
    originalMax = ''

    def __init__(self, inputCol, outputCol, s_min=0, s_max=0):
        """Create the wrapped scaler.

        inputCol:  name of the column to scale.
        outputCol: name of the column receiving the scaled values.
        s_min:     lower bound of the target range.
        s_max:     upper bound of the target range.

        NOTE(review): both bounds default to 0, which gives a zero-width
        target range. Spark's MinMaxScaler appears to require min < max, so
        callers should pass explicit bounds -- confirm against the Spark docs.
        """
        self.mmModel = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
        self.mmModel.setMin(s_min)
        self.mmModel.setMax(s_max)
        self.in_column = inputCol

    def get_input_col_name(self):
        """Return the input column name of the wrapped scaler."""
        return self.mmModel.getInputCol()

    def getMax(self):
        """Return the upper bound of the target range."""
        return self.mmModel.getMax()

    def getMin(self):
        """Return the lower bound of the target range."""
        return self.mmModel.getMin()

    def describe(self):
        """Print a placeholder description."""
        # Parenthesised form works on both Python 2 and Python 3.
        print('describe')

    def fit(self, df):
        """Record the input column's value range and fit the wrapped scaler.

        The input column's per-row values are flattened, and the overall
        minimum and maximum are stored in originalMin / originalMax.

        Returns whatever the wrapped scaler's fit(df) returns.
        """
        col = self.mmModel.getInputCol()
        # Build the flattened RDD once and reuse it for both aggregates.
        values = df.select(col).rdd.flatMap(lambda x: x[0])
        self.originalMin = values.min()
        self.originalMax = values.max()
        return self.mmModel.fit(df)

    def denormalize(self, value):
        """Map a scaled value back to the original scale.

        Inverse of the min-max rescaling:
        original = (value - min) / (max - min) * (E_max - E_min) + E_min
        where [min, max] is the target range and [E_min, E_max] is the range
        recorded by fit().

        Returns -999 when the target range has zero width (max == min),
        because the inverse is undefined in that case.
        """
        scaled_min = self.getMin()
        scaled_range = self.getMax() - scaled_min
        if scaled_range == 0:
            return -999
        original_range = self.originalMax - self.originalMin
        # float() guards against integer division on Python 2.
        return ((value - scaled_min) / float(scaled_range) * original_range
                + self.originalMin)

    def denormalize_df(self, df):
        """Not implemented yet: reads the input column name, returns None."""
        col = self.mmModel.getInputCol()

    def normalize(self, value):
        """Not implemented yet: always returns None."""
        pass