def minmax_scale(self, columns='*', Min=0.0, Max=1.0):
    """Rescale columns to the range [Min, Max], one ``<column>_scaled`` column each.

    Each column is scaled independently with::

        Rescaled(e_i) = (e_i - E_min) / (E_max - E_min) * (Max - Min) + Min

    Args:
        columns: ``"*"`` to process every column in ``self._df.schema.names``,
            otherwise a list of column names.
        Min: lower bound of the target range (int or float).
        Max: upper bound of the target range (int or float).

    Returns:
        ``self._df`` after it has been replaced by the frame that carries the
        additional ``<column>_scaled`` columns.

    Raises:
        AssertionError: if ``columns`` is neither ``"*"`` nor a list, or if
            ``Min`` / ``Max`` is not an int or float.
    """
    if columns == "*":
        columns = self._df.schema.names
    else:
        assert isinstance(columns, list), "Error: columns argument must be a list!"
    assert isinstance(Min, (float, int)), "Error: Min must be numerical"
    assert isinstance(Max, (float, int)), "Error: Max must be numerical"

    # Nothing to scale: leave the frame untouched.
    if not columns:
        return self._df

    # Imported here so the block does not rely on a file-level import.
    from pyspark.sql.types import DoubleType

    # udf() defaults to StringType when no return type is given, which would
    # turn every scaled value into a string; declare the numeric type
    # explicitly. The UDF is loop-invariant, so build it once.
    to_float = udf(lambda vector: float(vector[0]), DoubleType())

    for column in columns:
        outputcol = column + '_scaled'
        # MinMaxScaler works on vector columns, so wrap the single column in
        # a temporary one-element 'features' vector.
        assembler = VectorAssembler(inputCols=[column], outputCol='features')
        df = assembler.transform(self._df)
        scaler = MinMaxScaler(inputCol='features', outputCol=outputcol)
        scaler.setMax(Max).setMin(Min)
        df = scaler.fit(df).transform(df).drop('features')
        # Unwrap the one-element result vector back into a plain number.
        self._df = df.withColumn(outputcol, to_float(outputcol))
    return self._df
def min_max_scale(dataFrame, inputColNames, Min=0.0, Max=1.0):
    """Min-max scale the assembled "features" vector of a data frame.

    The frame returned by ``getAssembledDataFrame(dataFrame, inputColNames)``
    is expected to carry a "features" vector column, since that is the column
    the scaler reads.

    Args:
        dataFrame: input data frame.
        inputColNames: column names forwarded to ``getAssembledDataFrame``.
        Min: lower bound of the target range.
        Max: upper bound of the target range.

    Returns:
        The assembled frame with a "scaled features" column added and the
        intermediate "features" column dropped.
    """
    features_df = getAssembledDataFrame(dataFrame, inputColNames)

    estimator = MinMaxScaler(inputCol="features", outputCol="scaled features")
    estimator.setMax(Max)
    estimator.setMin(Min)

    fitted = estimator.fit(features_df)
    return fitted.transform(features_df).drop("features")
def scaling(dataFrame, inputColName, Min, Max):
    """Min-max scale one column into a new "scaled <inputColName>" column.

    Args:
        dataFrame: input data frame containing ``inputColName``.
        inputColName: name of the column to scale.
        Min: lower bound of the target range.
        Max: upper bound of the target range.

    Returns:
        The data frame with an extra float column named
        ``"scaled " + inputColName``. A confirmation message is printed
        before returning.
    """
    scaled_col = "scaled " + inputColName

    # The scaler operates on vectors: wrap the column in a temporary
    # one-element "features" vector first.
    vectorised = VectorAssembler(
        inputCols=[inputColName], outputCol="features"
    ).transform(dataFrame)

    min_max = MinMaxScaler(inputCol="features", outputCol=scaled_col)
    min_max.setMax(Max)
    min_max.setMin(Min)

    result = min_max.fit(vectorised).transform(vectorised).drop("features")

    # Unwrap the one-element result vector into a plain float column.
    first_as_float = udf(lambda vec: float(vec[0]), FloatType())
    result = result.withColumn(scaled_col, first_as_float(scaled_col))

    message = "Successfully scale the column '{0:s}' to range ({1:f}, {2:f}) and create a new column '{3:s}'."
    print(message.format(inputColName, min_max.getMin(), min_max.getMax(), scaled_col))
    return result
class mmscaler_wrapper():
    """Wrap a MinMaxScaler and remember the original value range.

    ``fit`` records the smallest and largest value found in the input column
    so that scaled values can later be mapped back with ``denormalize``.
    """

    # Class-level placeholders; real values are set in __init__ / fit.
    mmModel = ''
    originalMin = ''
    originalMax = ''

    def __init__(self, inputCol, outputCol, s_min=0, s_max=0):
        """Create the underlying scaler.

        Args:
            inputCol: name of the (vector) column to scale.
            outputCol: name of the column that receives the scaled vector.
            s_min: lower bound of the target range.
            s_max: upper bound of the target range.
        """
        # NOTE(review): the defaults give s_min == s_max, a zero-width target
        # range; callers presumably always pass explicit bounds -- confirm.
        self.mmModel = MinMaxScaler(inputCol=inputCol, outputCol=outputCol)
        self.mmModel.setMin(s_min)
        self.mmModel.setMax(s_max)
        self.in_column = inputCol

    def get_input_col_name(self):
        """Return the input column name of the wrapped scaler."""
        return self.mmModel.getInputCol()

    def getMax(self):
        """Return the upper bound of the target range."""
        return self.mmModel.getMax()

    def getMin(self):
        """Return the lower bound of the target range."""
        return self.mmModel.getMin()

    def describe(self):
        """Print a placeholder description."""
        # Parenthesised form prints the same text on Python 2 and Python 3.
        print('describe')

    def fit(self, df):
        """Record the original min/max of the input column and fit the scaler.

        Args:
            df: data frame containing the scaler's input column.

        Returns:
            Whatever the wrapped scaler's ``fit(df)`` returns.
        """
        col = self.mmModel.getInputCol()
        # Flatten every row's vector into individual values; the recorded
        # min/max are therefore taken over all vector elements together.
        values = df.select(col).rdd.flatMap(lambda x: x[0])
        self.originalMin = values.min()
        self.originalMax = values.max()
        return self.mmModel.fit(df)

    def denormalize(self, value):
        """Map a scaled value back to the original value range.

        Inverts ``scaled = (e - E_min) / (E_max - E_min) * (max - min) + min``:

            e = (scaled - min) / (max - min) * (E_max - E_min) + E_min

        Args:
            value: a value in the scaled range [getMin(), getMax()].

        Returns:
            The corresponding value in the original range, or ``-999`` when
            no value can be computed (zero-width target range).
        """
        lower = self.getMin()
        span = float(self.getMax() - lower)
        if span == 0:
            # A zero-width target range cannot be inverted.
            return -999
        # Divide by the target span (the previous code multiplied by it,
        # which was only correct for the [0, 1] range).
        v = (value - lower) / span * (self.originalMax - self.originalMin) + self.originalMin
        if v or v == 0:
            return v
        else:
            return -999

    def denormalize_df(self, df):
        """Unfinished stub: only looks up the input column name, returns None."""
        col = self.mmModel.getInputCol()

    def normalize(self, value):
        """Unimplemented placeholder."""
        pass